This is an R Markdown Notebook for analysis using data on the DC Bus System (WMATA Metrobus). The data were obtained here:

https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/

Load the packages to be used.

Get the Bus data.

First let’s check the working directory.


getwd()

Then, actually get the data.

Put the daily data together.


AllDays <- bind_rows(list(Oct03Raw, Oct04Raw, Oct05Raw, Oct06Raw, Oct07Raw),
                     .id = c("group")
                    )
# dim(AllDays)
str(AllDays)

Deleting old data frames.


for (i in 3:7){
  rm(list = ls(pattern = paste0("Oct0", i, "Raw")
              )
    )
  
  message("Deleting Oct0", i, "Raw")
  }

Updating variable types.

Then, sorting the data and adding a RowNumber (to be used for identifying rows later in the analyses.)


rm(i)


AllDays$group <- factor(AllDays$group)
AllDays$Route_Direction <- factor(AllDays$Route_Direction)
AllDays$Event_Time <- as.POSIXct(AllDays$Event_Time, format = "%m-%d-%y %I:%M:%S %p")
AllDays$Departure_Time <- as.POSIXct(AllDays$Departure_Time, format = "%m-%d-%y %I:%M:%S %p")

str(AllDays)


AllDays_Sorted <- arrange(AllDays,
                          Bus_ID,
                          Event_Time
                         ) %>% 
  mutate(RowNum_OG = row_number() # this is useful in identify the row later on
        )

rm(AllDays)
str(AllDays_Sorted)

# View(head(AllDays_Sorted, 100))

Inspecting the values of Stop_ID, and finding that it can take the values “” (blank) and “NULL”.


View(group_by(AllDays_Sorted,
              Stop_ID
             ) %>% 
       summarise(
         Cnt = n()
         ) %>% 
       arrange(Stop_ID)
    )

View(filter(AllDays_Sorted,
            is.na(Stop_ID) |
              Stop_ID == "" |
              Stop_ID == "NULL"
           ) %>% 
       arrange(Stop_Desc)
    )

Creating a table of distinct Stop_Desc values when Stop_ID is “” (blank) or “NULL”.


StopID_New <- filter(AllDays_Sorted,
                     is.na(Stop_ID) |
                       Stop_ID == "" |
                       Stop_ID == "NULL"
                    ) %>% 
  select(Stop_ID, Stop_Desc) %>% 
  distinct() %>% 
  arrange(Stop_ID, Stop_Desc) %>% 
  mutate(StopID_New = 1:nrow(.)
        )

View(StopID_New)
StopID_New

Creating a full updated table by filling in StopID_New for when Stop_ID is “” (blank) or NULL.


AllDays_StopIDNew <- left_join(AllDays_Sorted,
                               select(StopID_New,
                                      Stop_Desc,
                                      StopID_New
                                     ),
                               by = c("Stop_Desc" = "Stop_Desc")
                              ) %>% 
  mutate(StopID_Clean = ifelse(is.na(StopID_New),
                               Stop_ID,
                               StopID_New
                              ),
         StopID_Indicator = factor(ifelse(is.na(StopID_New),
                                          "ID_OK",
                                          "ID_Bad"
                                         )
                                  )
        )

rm(StopID_New)
rm(AllDays_Sorted)
str(AllDays_StopIDNew)

# View(tail(AllDays_StopIDNew, 500))
# View(filter(AllDays_StopIDNew,
#             Stop_Desc == "METROWAY ANNNOUCEMNT CORR"
#            )
#     )

Lat Long stats for pulling in Zip codes later.


LL_Stats <- group_by(AllDays_StopIDNew,
                     StopID_Clean
                    ) %>% 
  summarise(Lat_Mean = mean(Latitude, na.rm = TRUE),
            Lat_Med = median(Latitude, na.rm = TRUE),
            Lng_Mean = mean(Longitude, na.rm = TRUE),
            Lng_Med = median(Longitude, na.rm = TRUE)
           ) %>% 
  mutate(Lat_MeaLessMed = Lat_Mean - Lat_Med,
         Lng_MeaLessMed = Lng_Mean - Lng_Med,
         RowNum = row_number()
        )

str(LL_Stats)
summary(LL_Stats)

View(head(arrange(LL_Stats,
                  Lat_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lat_MeaLessMed)
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  Lng_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lng_MeaLessMed)
                 ),
          500
         )
    )

Pulling in Zip Code data from api.geonames.org.


# URL EXAMPLE:
# http://api.geonames.org/findNearbyPostalCodesJSON?lat=38.89560&lng=-76.94873&radius=0&username=supermdat

url_1 <- "http://api.geonames.org/findNearbyPostalCodesJSON?lat="
url_2 <- "&lng="
url_3 <- "&radius=0&username="
username <- "supermdat"


# need to group in bunches as http://api.geonames.org limits pulls to 2000 per hour


##### Store everything in multiple lists
pages1 <- list()


system.time(

for(i in 1:1000){
  lat <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lat_Med)
  
  lng <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lng_Med)
  
  APIData1 <- fromJSON(paste0(url_1,
                              lat,
                              url_2,
                              lng,
                              url_3,
                              username
                             ),
                       flatten = TRUE
                      )
  
  message("Retrieving Zip Code ", i)
  
  pages1[[i]] <- APIData1$postalCodes
  
}
)


##### Combine the lists into one page
Zips1 <- rbind.pages(pages1[sapply(pages1, length) > 0])


##### Combine all pages
Zips_All <- bind_rows(Zips0,
                      Zips1,
                      Zips2,
                      Zips3,
                      Zips4,
                      Zips5,
                      Zips6,
                      Zips7,
                      Zips8,
                      Zips9,
                      Zips10,
                      .id = "id"
                     ) %>% 
  mutate(UniqueLatLng = paste(lat, lng, sep = "__")
        )

# str(Zips_All)
# View(head(Zips_All))


# str(LL_Stats)
LL_Stats_UnqLatLng <- mutate(LL_Stats,
                             UniqueLatLng = paste(Lat_Med, Lng_Med, sep = "__")
                            )

# str(LL_Stats_UnqLatLng)
# View(head(LL_Stats_UnqLatLng))


LL_StatsZips <- left_join(LL_Stats_UnqLatLng,
                          Zips_All,
                          by = c("UniqueLatLng" = "UniqueLatLng")
                         )

str(LL_StatsZips)
# View(head(LL_StatsZips))

# Not sure whey these couldn't be found (why they're NA)
View(filter(LL_StatsZips,
            is.na(postalCode)
           )
    )

Join to create one dataset that also includes Zip variables.


rm(url_1, url_2, url_3, username, pages0, pages1, pages2, pages3, pages4, pages5, pages6, pages7, pages8, pages9, pages10, i, lat, lng, APIData0, APIData1, APIData2, APIData3, APIData4, APIData5, APIData6, APIData7, APIData8, APIData9, APIData10, LL_Stats, LL_Stats_UnqLatLng)


AllDays_Zips <- left_join(AllDays_StopIDNew,
                          LL_StatsZips,
                          by = c("StopID_Clean" = "StopID_Clean")
                         ) %>% 
  rename(Stop_State = adminCode1,
         Stop_County = adminName2,
         Stop_City = placeName,
         Stop_Zip = postalCode
         )

rm(AllDays_StopIDNew, LL_StatsZips)
str(AllDays_Zips)

Updating variable types.


AllDays_Zips$Stop_State <- factor(AllDays_Zips$Stop_State)
AllDays_Zips$Stop_County <- factor(AllDays_Zips$Stop_County)
AllDays_Zips$Stop_Zip <- factor(AllDays_Zips$Stop_Zip)
AllDays_Zips$Stop_City <- factor(AllDays_Zips$Stop_City)

AllDays_Zips$distance <- as.numeric(AllDays_Zips$distance)
AllDays_Zips$countryCode <- factor(AllDays_Zips$countryCode)
AllDays_Zips$adminName1 <- factor(AllDays_Zips$adminName1)

str(AllDays_Zips)

Feature engineering.

Inspecting incidences of consecutive Stop_IDs. This is done because investigation showed that many conseutive events occurr at the same Stop_ID, but with various Dwell_Times, Odometer_Distances, etc. All of which affect calculations and analyses.

Create data on the runs (consecutive Stop_IDs).


StopID_Runs <- rle(AllDays_Zips$StopID_Clean)

StopID_Runs$ends <- cumsum(StopID_Runs$lengths)

StopID_Runs$starts <- ifelse(is.na(lag(StopID_Runs$ends)
                                  ),
                             1,
                             lag(StopID_Runs$ends) + 1
                            )

str(StopID_Runs)
# class(StopID_Runs)
# 
# StopID_Runs_df <- data.frame(unclass(StopID_Runs))
# str(StopID_Runs_df)
# class(StopID_Runs_df)
# rm(StopID_Runs_df)

Trying to link data on RunsGroups with the original data (AllDays_Sorted). The goal is to select only one record per RunsGroup - that being the record with the longest Dwell_Time.

I attempted this computation using both data.frames (dplyr) and data.tables (data.table). However, with 2,809,062 rows in one dataset and 3,119,443 rows in the other dataset, the current computation time is over 5 days…so I’m trying a different strategy to only select the first record in a run.


# Create a RunsGroup variable for each run
# StopID_Runs_df$RunsGroup <- paste0("g", seq(1:nrow(StopID_Runs_df)
#                                            )
#                                   )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# StopID_Runs_df <- StopID_Runs_df %>% 
#   mutate(RowNum = row_number()
#         )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# 
# # Converting to data.tables for, hopefully, improved performance (speed) in computation
# StopID_Runs_dt <- data.table(StopID_Runs_df)
# setkey(StopID_Runs_dt, RowNum)
# str(StopID_Runs_dt)
# 
# AllDays_Sorted_dt <- data.table(AllDays_Sorted)
# setkey(AllDays_Sorted_dt, RowNum_OG)
# str(AllDays_Sorted_dt)
# # rm(AllDays_Sorted_dt)
# 
# 
# # Actual loop to perform the computations and link to original data (AllDays_Sorted_dt)
# GroupData <- list()
# for(i in 1:nrow(StopID_Runs_dt)
#    ) {
#   assign(paste0("group_", i),
#            StopID_Runs_dt[RowNum == i, RunsGroup]
#           )
# 
#     #####  The code below is the same code as above, but done with dplyr  #####
# 
#     # assign(paste0("group_", i),
#   #        filter(StopID_Runs_df,
#   #               RowNum == i
#   #              ) %>% 
#   #          select(RunsGroup)
#   #       )
# 
#   assign(paste0("group_", i, "_start"),
#          StopID_Runs_dt[RowNum == i, starts]
#         )
# 
#   assign(paste0("group_", i, "_end"),
#          StopID_Runs_dt[RowNum == i, ends]
#         )
# 
#   assign(paste0("group_", i, "_rows"),
#          AllDays_Sorted_dt[RowNum_OG >= as.numeric(get(paste0("group_", i, "_start")
#                                                       )
#                                                   ) &
#                            RowNum_OG <= as.numeric(get(paste0("group_", i, "_end")
#                                                       )
#                                                   ),
#                            RunsGroup := as.character(get(paste0("group_", i)
#                                                         )
#                                                     )
#                           ]
# 
#     #####  The code below is the same as the code above, but done with dplyr  #####
# 
#          # filter(AllDays_Sorted,
#          #        between(RowNum_OG,
#          #                as.numeric(get(paste0("group_", i, "_start")
#          #                              )
#          #                          ),
#          #                as.numeric(get(paste0("group_", i, "_end")
#          #                              )
#          #                          )
#          #               )
#          #       ) %>% 
#          #   mutate(RunsGroup = as.character(get(paste0("group_", i)
#          #                                     )
#          #                                 )
#          #        )
#         )
# 
#   GroupData[[i]] <- get(paste0("group_", i, "_rows"))
# 
#   message("Processing Group ", i, " of 2,809,062")
# }
# 
# 
# GroupData_df <- rbind.fill(GroupData)
# str(GroupData_df)
# head(GroupData_df)
# tail(GroupData_df)
# # rm(GroupData_df)
# 
# 
# group_1
# group_1_start
# group_1_end
# group_1_rows
# group_2_rows
# group_3_rows
# group_50_rows
# str(group_50_rows)
# group_2809062_rows
# GroupData[[1]]
# GroupData[[50]]
# 
# 
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# 
# # head(StopID_Runs$starts, 20)
# # head(AllDays_NewOrder$Stop_ID, 20)
# # 
# # 
# # dat <- as.data.frame(c(1,1,7,7,7,9,6,8,2,2,2,1,1,1,1,1))
# # colnames(dat)[1] <- "dat"
# # r <- rle(dat$dat)
# # dat$run <- rep(r$lengths, r$lengths)
# # dat$runLag <- lag(dat$run)
# # dat$cond <- rep(r$values, r$lengths)
# # dat
# # View(dat)

When consecutive Stop_ID occurrs, only take the first occurrence. This is done because the computation time to select only the record with the longest Dwell_Time for each run was too long (over 5 days).

This is probably less than ideal with regards to Dwell_Time, but should not make much difference for calculations of travel time, speed, etc.


AllDays_FirstStopID <- AllDays_Zips[StopID_Runs$starts, ]

dim(AllDays_Zips)
dim(AllDays_FirstStopID)

nrow(AllDays_Zips) - nrow(AllDays_FirstStopID)

rm(AllDays_Zips, StopID_Runs)
str(AllDays_FirstStopID)

Feature engineering.

Creating new variables.


AllDays_AddVars <- mutate(AllDays_FirstStopID,
                          Odometer_Distance_Mi = Odometer_Distance / 5280, #5,280 feet in 1 mile
                          Dwell_Time2 = as.numeric(Departure_Time - Event_Time),
                          Event_Time_Yr = as.integer(year(Event_Time)),
                          Event_Time_Mth = as.integer(month(Event_Time)),
                          Event_Time_Date = day(Event_Time),
                          Event_Time_Day = wday(Event_Time, label = TRUE),
                          Event_Time_Hr = hour(Event_Time),
                          Event_Time_Min = minute(Event_Time),
                          Event_Time_HrGroup = factor(ifelse(Event_Time_Hr < 3,
                                                             "Group0_2",
                                                      ifelse(Event_Time_Hr < 6,
                                                             "Group3_5",
                                                      ifelse(Event_Time_Hr < 9,
                                                             "Group6_8",
                                                      ifelse(Event_Time_Hr < 12,
                                                             "Group9_11",
                                                      ifelse(Event_Time_Hr < 15,
                                                             "Group12_14",
                                                      ifelse(Event_Time_Hr < 18,
                                                             "Group15_17",
                                                      ifelse(Event_Time_Hr < 21,
                                                             "Group18_20",
                                                      ifelse(Event_Time_Hr < 24,
                                                             "Group21_23"
                                                            )))))))),
                                                         levels = c("Group0_2",
                                                                    "Group3_5",
                                                                    "Group6_8",
                                                                    "Group9_11",
                                                                    "Group12_14",
                                                                    "Group15_17",
                                                                    "Group18_20",
                                                                    "Group21_23"
                                                                   ),
                                                         ordered = TRUE
                                                     )
                         )

rm(AllDays_FirstStopID)
str(AllDays_AddVars)

Function for calculating the distance traveled based on the Haversine formula. Original code from: https://www.r-bloggers.com/great-circle-distance-calculations-in-r/


# Calculates the geodesic distance between two points specified by radian latitude/longitude using the Haversine formula (hf)
# gcd.hf <- function(long1, lat1, long2, lat2) {
#   R <- 6371 # Earth mean radius [km]
#   delta.long <- (long2 - long1)
#   delta.lat <- (lat2 - lat1)
#   a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
#   c <- 2 * asin(min(1,sqrt(a)))
#   d = R * c * 0.621371 # 1 km = 0.621371 miles
#   return(d) # Distance in miles
# }

Feature engineering.

Creating more variables. Creating a BusEvent row number for future identification purposes. Then, creating various variables to analyze distance traveled and speed.


AllDays_BusDay <- group_by(AllDays_AddVars,
                           Bus_ID,
                           Event_Time_Date
                          ) %>% 
  mutate(BusDay_EventNum = row_number(),  # used to identify Bus movements on a particular date
         
         Route_Lag1 = lag(Route),  # used in future analyses to identify Route changes
         RouteAlt_Lag1 = lag(RouteAlt),  # used in future analyses to identify RouteAlt (direction) changes
         
         Odometer_Distance_Lag1 = lag(Odometer_Distance),
         
         Latitude_L1 = lag(Latitude),
         Longitude_L1 = lag(Longitude),
         # Lat_Radian = Latitude*pi/180,
         # Long_Radian = Longitude*pi/180,
         # Lat_Radian_L1 = lag(Lat_Radian),
         # Long_Radian_L1 = lag(Long_Radian),
         
         # accounting for potential negative distances
         TravelDistance_Ft = ifelse(Odometer_Distance > Odometer_Distance_Lag1,
                                    Odometer_Distance - Odometer_Distance_Lag1,
                                    NA
                                   ),
         TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
         
         # TravelDistance_Mi2 = gcd.hf(long1 = Long_Radian_L1,
         #                             lat1 = Lat_Radian_L1,
         #                             long2 = Long_Radian,
         #                             lat2 = Lat_Radian
         #                            ),
         
         TravelDistance_Mi_Hvrs = 
                              # ifelse((is.na(Longitude_L1) | is.na(Latitude_L1)
                              #        ),
                              #        NA,
                              distHaversine(cbind(Longitude_L1, Latitude_L1),
                                            cbind(Longitude, Latitude)
                                           ) * 0.000621371, # 0.000621371 miles = 1 meter
         
         # accounting for potential negative times
         TravelTime_Sec = as.numeric(ifelse(Event_Time > lag(Departure_Time),
                                            Event_Time - lag(Departure_Time),
                                            NA
                                           )
                                    ),
         TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
         
         # accounting for potential negative or zero travel times
         SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
                               TravelDistance_Mi / TravelTime_Hr,
                               NA
                              ),
         
         Start_ID = lag(StopID_Clean),
         Start_Desc = lag(Stop_Desc),
         StartStop_ID = ifelse(is.na(Start_ID),
                               paste("NULL", StopID_Clean, sep = "--"),
                               paste(Start_ID, StopID_Clean, sep = "--")
                              )
        ) %>% 
  as.data.frame()


rm(AllDays_AddVars)
str(AllDays_BusDay)

# summary(AllDays_BusDay)

# View(tail(AllDays_BusDay, 50))

Inspecting for issues with StartStop_ID (where the value is either NA or contains NULL). They ONLY exist when BusDay_EventNum = 1 (which is by design). So everything looks OK.


View(group_by(AllDays_BusDay,
              StartStop_ID
             ) %>% 
       summarise(
         Cnt = n()
       ) %>% 
       arrange(desc(Cnt)
              )
    )

View(filter(AllDays_BusDay,
            (is.na(StartStop_ID) |
              str_detect(StartStop_ID, "NULL")
            ) &
              BusDay_EventNum != 1
           )
    )

Stats (quantiles) overall for TravelDistance_Mi.


Quantiles_dt <- AllDays_BusDay %>% 
  mutate(TD_Mi_q2 = quantile(x = TravelDistance_Mi, probs = 0.02, na.rm = TRUE),
         TD_Mi_q98 = quantile(x = TravelDistance_Mi, probs = 0.98, na.rm = TRUE),
         TT_Sec_q2 = quantile(x = TravelTime_Sec, probs = 0.02, na.rm = TRUE),
         TT_Sec_q98 = quantile(x = TravelTime_Sec, probs = 0.98, na.rm = TRUE),
         TT_Hr_q2 = quantile(x = TravelTime_Hr, probs = 0.02, na.rm = TRUE),
         TT_Hr_q98 = quantile(x = TravelTime_Hr, probs = 0.98, na.rm = TRUE)
        ) %>% 
  data.table()


Stats <- Quantiles_dt %>% 
  mutate(TD_Mi_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Mean_F = mean(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                             na.rm = TRUE
                            ),
         TD_Mi_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Med_F = median(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                              na.rm = TRUE
                             ),
         TD_Mi_Cnt = sum(!is.na(TravelDistance_Mi)
                        ),
         TD_Mi_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98]
                                 )
                          ),
            
         TT_Sec_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Mean_F = mean(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                              na.rm = TRUE
                             ),
         TT_Sec_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Med_F = median(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                               na.rm = TRUE
                              ),
         TT_Sec_Cnt = sum(!is.na(TravelTime_Sec)
                         ),
         TT_Sec_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98]
                                   )
                           ),

         TT_Hr_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Mean_F = mean(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                             na.rm = TRUE
                            ),
         TT_Hr_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Med_F = median(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                              na.rm = TRUE
                             ),
         TT_Hr_Cnt = sum(!is.na(TravelTime_Hr)
                        ),
         TT_Hr_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98]
                                 )
                          )
        ) %>% 
  data.frame()

rm(AllDays_BusDay)
rm(Quantiles_dt)
str(Stats)
# View(head(Stats, 50))

Stats for StartStop_ID.


Quantiles_SS_dt <- group_by(Stats,
                            StartStop_ID
                           ) %>% 
  mutate(TD_Mi_SS_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SS_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SS_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SS_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SS_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SS_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt <- group_by(Quantiles_SS_dt,
                       StartStop_ID
                      ) %>% 
  mutate(TD_Mi_SS_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Mean_F = mean(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                na.rm = TRUE
                               ),
         TD_Mi_SS_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Med_F = median(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                 na.rm = TRUE
                                ),
         TD_Mi_SS_Cnt = sum(!is.na(TravelDistance_Mi)
                           ),
         TD_Mi_SS_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95]
                                    )
                             ),
            
         TT_Sec_SS_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Mean_F = mean(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Sec_SS_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Med_F = median(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                  na.rm = TRUE
                                 ),
         TT_Sec_SS_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SS_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95]
                                     )
                              ),

         TT_Hr_SS_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Mean_F = mean(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                na.rm = TRUE
                               ),
         TT_Hr_SS_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Med_F = median(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Hr_SS_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SS_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95]
                                    )
                             )
        ) %>% 
  data.frame()

rm(Stats)
rm(Quantiles_SS_dt)
str(Stats_StSt)
# View(head(Stats_StSt, 50))

Stats for StartStop_ID with Event_Time_HrGroup.


Quantiles_SSHG_dt <- group_by(Stats_StSt,
                              StartStop_ID,
                              Event_Time_HrGroup
                             ) %>% 
  mutate(TD_Mi_SSHG_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SSHG_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SSHG_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SSHG_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SSHG_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SSHG_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt_HrGrp <- group_by(Quantiles_SSHG_dt,
                             StartStop_ID,
                             Event_Time_HrGroup
                            ) %>% 
  mutate(TD_Mi_SSHG_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Mean_F = mean(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TD_Mi_SSHG_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Med_F = median(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TD_Mi_SSHG_Cnt = sum(!is.na(TravelDistance_Mi)
                             ),
         TD_Mi_SSHG_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95]
                                      )
                               ),
            
         TT_Sec_SSHG_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Mean_F = mean(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Sec_SSHG_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Med_F = median(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                    na.rm = TRUE
                                   ),
         TT_Sec_SSHG_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SSHG_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95]
                                       )
                                ),

         TT_Hr_SSHG_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Mean_F = mean(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TT_Hr_SSHG_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Med_F = median(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Hr_SSHG_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SSHG_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95]
                                      )
                               )
        ) %>% 
  data.frame()

rm(Stats_StSt)
rm(Quantiles_SSHG_dt)
str(Stats_StSt_HrGrp)
# View(head(Stats_StSt_HrGrp, 50))

Feature engineering.

Creating a BusEventRoute row number, and a RouteAlt_Lag1 indicator for future identification purposes.


# rm(Quantiles_dt)
# rm(Quantiles_SS_dt)
# rm(AllDays_BusDay)
# rm(Quantiles_SSHG_dt)
# rm(Stats_StSt)

# AllDays_BusDayRoute <- group_by(Stats_StSt_HrGrp,
#                                 Bus_ID,
#                                 Event_Time_Date,
#                                 Route
#                                ) %>% 
#   mutate(RouteAlt_Lag2 = lag(RouteAlt)  # used in future analyses to identify RouteAlt (direction) changes
#          
#          # Odometer_Distance_Lag1 = lag(Odometer_Distance),
#          # 
#          # # accounting for potential negative distances
#          # TravelDistance_Ft = ifelse(Odometer_Distance >= Odometer_Distance_Lag1,
#          #                            Odometer_Distance - Odometer_Distance_Lag1,
#          #                            NA
#          #                           ),
#          # TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
#          # 
#          # # accounting for potential negative times
#          # TravelTime_Sec = as.numeric(ifelse(Event_Time >= lag(Departure_Time),
#          #                                    Event_Time - lag(Departure_Time),
#          #                                    NA
#          #                                   )
#          #                            ),
#          # TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
#          # 
#          # # accounting for potential negative or zero travel times
#          # SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
#          #                       TravelDistance_Mi / TravelTime_Hr,
#          #                       NA
#          #                      )
#         ) %>% 
#   data.frame()
# 
# rm(Stats_StSt_HrGrp)
# str(AllDays_BusDayRoute)

Feature engineering.

Calculating a variable to know if the RouteAlt changed. Could be useful in helping identifying weirdness in calculated distances and speeds.


# rm(Stats_StSt_HrGrp)

AllDays_DirChange <- Stats_StSt_HrGrp %>%  # AllDays_BusDayRoute %>% 
  mutate(RteChange = ifelse(Route == Route_Lag1,
                            "Same",
                            "Change"
                           ),
         RteChange2 = factor(ifelse(is.na(RteChange),
                                    "Change",
                                    RteChange
                                   )
                            ),
         DirChange = ifelse(RouteAlt == RouteAlt_Lag1,
                            "Same",
                            "Change"
                           ),
         DirChange2 = factor(ifelse(is.na(DirChange),
                                    "Change",
                                    DirChange
                                   )
                            )
        )

# rm(AllDays_BusDayRoute)
rm(Stats_StSt_HrGrp)
str(AllDays_DirChange)

View(filter(AllDays_DirChange,
            between(RowNum_OG, 2570060, 2570080)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

Re-ordering the variables to ease with comprehension.


AllDays_NewOrder <-  select(AllDays_DirChange,
                            RowNum_OG,
                            UniqueLatLng,
                            group,
                            StartStop_ID,
                            BusDay_EventNum,
                            Bus_ID,
                            Route,
                            RteChange2,
                            RouteAlt,
                            # RouteAlt_Lag1,
                            DirChange2,
                            Route_Direction,
                            Stop_Sequence,
                            Start_ID,
                            Start_Desc,
                            # Stop_ID,
                            StopID_Clean,
                            StopID_Indicator,
                            Stop_Desc,
                            countryCode,
                            Stop_State,
                            Stop_County,
                            Stop_City,
                            Stop_Zip,
                            Event_Type,
                            Event_Description,
                            Event_Time_Yr,
                            Event_Time_Mth,
                            Event_Time_Date,
                            Event_Time_Day,
                            Event_Time_Hr,
                            Event_Time_HrGroup,
                            Event_Time_Min,
                            Event_Time,
                            Departure_Time,
                            Dwell_Time,
                            Dwell_Time2,
                            Delta_Time,
                            Latitude,
                            Longitude,
                            Heading,
                            Odometer_Distance,
                            Odometer_Distance_Lag1,
                            Odometer_Distance_Mi,
                            TravelDistance_Ft,
                            TravelDistance_Mi,
                            TravelDistance_Mi_Hvrs,
                            TD_Mi_q2,
                            TD_Mi_q98,
                            TD_Mi_SS_q5,
                            TD_Mi_SS_q95,
                            TD_Mi_SSHG_q5,
                            TD_Mi_SSHG_q95,
                            TD_Mi_Mean,
                            TD_Mi_Mean_F,
                            TD_Mi_SS_Mean,
                            TD_Mi_SS_Mean_F,
                            TD_Mi_SSHG_Mean,
                            TD_Mi_SSHG_Mean_F,
                            TD_Mi_Med,
                            TD_Mi_Med_F,
                            TD_Mi_SS_Med,
                            TD_Mi_SS_Med_F,
                            TD_Mi_SSHG_Med,
                            TD_Mi_SSHG_Med_F,
                            TD_Mi_Cnt,
                            TD_Mi_Cnt_F,
                            TD_Mi_SS_Cnt,
                            TD_Mi_SS_Cnt_F,
                            TD_Mi_SSHG_Cnt,
                            TD_Mi_SSHG_Cnt_F,
                            TravelTime_Sec,
                            TT_Sec_q2,
                            TT_Sec_q98,
                            TT_Sec_SS_q5,
                            TT_Sec_SS_q95,
                            TT_Sec_SSHG_q5,
                            TT_Sec_SSHG_q95,
                            TT_Sec_Mean,
                            TT_Sec_Mean_F,
                            TT_Sec_SS_Mean,
                            TT_Sec_SS_Mean_F,
                            TT_Sec_SSHG_Mean,
                            TT_Sec_SSHG_Mean_F,
                            TT_Sec_Med,
                            TT_Sec_Med_F,
                            TT_Sec_SS_Med,
                            TT_Sec_SS_Med_F,
                            TT_Sec_SSHG_Med,
                            TT_Sec_SSHG_Med_F,
                            TT_Sec_Cnt,
                            TT_Sec_Cnt_F,
                            TT_Sec_SS_Cnt,
                            TT_Sec_SS_Cnt_F,
                            TT_Sec_SSHG_Cnt,
                            TT_Sec_SSHG_Cnt_F,
                            TravelTime_Hr,
                            TT_Hr_q2,
                            TT_Hr_q98,
                            TT_Hr_SS_q5,
                            TT_Hr_SS_q95,
                            TT_Hr_SSHG_q5,
                            TT_Hr_SSHG_q95,
                            TT_Hr_Mean,
                            TT_Hr_Mean_F,
                            TT_Hr_SS_Mean,
                            TT_Hr_SS_Mean_F,
                            TT_Hr_SSHG_Mean,
                            TT_Hr_SSHG_Mean_F,
                            TT_Hr_Med,
                            TT_Hr_Med_F,
                            TT_Hr_SS_Med,
                            TT_Hr_SS_Med_F,
                            TT_Hr_SSHG_Med,
                            TT_Hr_SSHG_Med_F,
                            TT_Hr_Cnt,
                            TT_Hr_Cnt_F,
                            TT_Hr_SS_Cnt,
                            TT_Hr_SS_Cnt_F,
                            TT_Hr_SSHG_Cnt,
                            TT_Hr_SSHG_Cnt_F,
                            SpeedAvg_Mph
                           )

rm(AllDays_DirChange)
str(select(AllDays_NewOrder,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(AllDays_NewOrder)

# View(head(AllDays_NewOrder, 500))
# View(tail(AllDays_NewOrder, 500))

Summarizing the data to help spot anomolies.


View(group_by(AllDays_NewOrder,
              Stop_City) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = 100*Cnt_Num / (nrow(AllDays_NewOrder)
                                         )
                ) %>% 
       arrange(desc(Cnt_Num))
)

summary(AllDays_NewOrder)

Investigation of TravelDistance_Mi.

View(TravDistMi_Pctiles): 99% of TravelDistance_Mi are about 1 mile or less…but some weird TravelDistance_Mi values (e.g., 584 miles traveled) exist.


TravDistMi_Ntile <- as.data.frame(AllDays_NewOrder$TravelDistance_Mi) %>% 
  mutate(#Pctile = ntile(AllDays_NewOrder$TravelDistance_Mi, 100),
         #MinR = min_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR = percent_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR_Round = round(PctR, 2)
        ) 

colnames(TravDistMi_Ntile)[1] <- "TravelDistance_Mi"
# str(TravDistMi_Ntile)

TravDistMi_Ntile_Rows <- nrow(TravDistMi_Ntile)

# View(tail(TravDistMi_Ntile, 500))


TravDistMi_Pctiles <- group_by(TravDistMi_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravDistMiAtPctile = min(TravelDistance_Mi),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravDistMi_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

rm(TravDistMi_Ntile)
rm(TravDistMi_Ntile_Rows)

View(TravDistMi_Pctiles)
TravDistMi_Pctiles

Investigation of TravelDistance_Mi.

Why are some TravelDistance_Mi “NA”? It looks like partially because the records are the first trip of the day (for that bus), so I purposefully set the distance to “NA”. Another reason is due to the odometer recording a value less than the previous odometer recording. In most cases, I have no explanation for this - though I have observed about 67% of all instances where TravelDistance_Mi is NA (other than because it’s the first record of the day) are instances where DirChange2 is “Change”. This is weird and should be asked to WMATA.


# View(head(AllDays_NewOrder, 500))

View(filter(AllDays_NewOrder,
            BusDay_EventNum != 1 # When BusDay_EventNum == 1, TravelDistance_Mi is NA by design (don't want to calculate distance based on yesterday's position)
           ) %>% 
       group_by(StartStop_ID) %>% 
       summarise(Cnts = sum(is.na(TravelDistance_Mi)
                           )
                ) %>% 
       arrange(desc(Cnts)
              )
    )

View(filter(AllDays_NewOrder,
            StartStop_ID == "1000245--1000211"
           ) %>% 
       select(RowNum_OG,
              StartStop_ID,
              Event_Time,
              Event_Time_HrGroup,
              Bus_ID,
              TravelDistance_Mi,
              TravelDistance_Mi_Hvrs,
              TD_Mi_SS_Mean,
              TD_Mi_SS_Mean_F,
              TD_Mi_SSHG_Mean,
              TD_Mi_SSHG_Mean_F,
              TD_Mi_SS_Med,
              TD_Mi_SS_Med_F,
              TD_Mi_SSHG_Med,
              TD_Mi_SSHG_Med_F,
              TD_Mi_SS_Cnt,
              TD_Mi_SS_Cnt_F,
              TD_Mi_SSHG_Cnt,
              TD_Mi_SSHG_Cnt_F
              ) %>% 
       mutate(Ratio_MeanToHvrs = TD_Mi_SS_Mean / TravelDistance_Mi_Hvrs) %>% 
       arrange(Event_Time)
    )

View(filter(AllDays_NewOrder,
            is.na(TravelDistance_Mi)
           )
    )

# These records are NA becuase the record is the first record of the day (the Event_Time_Date)
View(filter(AllDays_NewOrder,
            between(RowNum_OG, 326, 346) | # 336
              between(RowNum_OG, 591, 611) | # 601
              between(RowNum_OG, 845, 865) # 855
           )
    )

Investigation of TravelDistance_Mi.

These records are NA becuase the current record odometer is less than the previous record odometer. Theoretically, this should NOT happen. Me: it appears that about 67% of all instances where TravelDistance_Mi is NA (other than because it’s th first record of the day) are instances where DirChange2 is “Change”. This is weird and should be asked to WMATA.


View(filter(AllDays_NewOrder,
            between(RowNum_OG, 194, 214) | # 204
              between(RowNum_OG, 440, 460) | # 450
              between(RowNum_OG, 478, 498) | # 488
              between(RowNum_OG, 510, 530) # 520
           )
    )

TestTable <- filter(AllDays_NewOrder,
                    BusDay_EventNum != 1
                   ) %>% 
  mutate(TravelDistance_NA = as.factor(ifelse(is.na(TravelDistance_Mi),
                                              "True",
                                              "False"
                                             )
                                      )
        ) %>%
  group_by(DirChange2, TravelDistance_NA) %>%
  summarise(TravDistMi_NACnts = n()
           )

# TestTable

TestTable_Spread <- as.data.frame(spread(TestTable,
                                         TravelDistance_NA,
                                         TravDistMi_NACnts
                                        )
                                 ) %>% 
  select(False,
         True
        )

row.names(TestTable_Spread) <- c("Change", "Same")
# str(TestTable_Spread)
# TestTable_Spread

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           2
          )

Investigation of TravelDistance_Mi.

Let’s look at just the TravelDistance_Mi values that are NOT “NA”.


rm(TestTable, TestTable_Spread)

TravelDistance_Mi_NoNA <- filter(AllDays_NewOrder,
                                 # TravelDistance_Mi != 0 &
                                 !is.na(TravelDistance_Mi)
                                )

dim(AllDays_NewOrder)
dim(TravelDistance_Mi_NoNA)
nrow(AllDays_NewOrder) - nrow(TravelDistance_Mi_NoNA)

str(TravelDistance_Mi_NoNA)
summary(TravelDistance_Mi_NoNA)

Investigation of TravelDistance_Mi.

Let’s plot just the TravelDistance_Mi values that are NOT “NA”.


TravDistMi_HistDen <- ggplot(select(TravelDistance_Mi_NoNA,
                                    TravelDistance_Mi
                                   ),
                             aes(x = TravelDistance_Mi,
                                 y = ..density..
                                )
                            ) +
  geom_histogram(binwidth = 0.05, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 4.0)
                 ) +
  labs(title = "Variation in Distance Between Stops",
       x = "Travel Distance (miles)",
       y = "Density"
      )

TravDistMi_HistDen

Investigation of TravelDistance_Mi.

Looking at the extremely large TravelDistance_Mi values. Some (aprox 27%) of TravelDistance_Mi values > 1 mile are when the DirChange2 changes…but what about the other ~73%?


rm(TravelDistance_Mi_NoNA)

# examples of weirdly large TravelDistance_Mi
View(filter(AllDays_NewOrder,
            TravelDistance_Mi > 1.1587121212 # 1.1587121212 is the 99th percentile
           ) %>% 
       arrange(desc(TravelDistance_Mi)
              )
    )


# Why are these extremes?  Airports?  Bus collection points?
View(filter(AllDays_NewOrder,
              between(RowNum_OG, 494044, 494064) | # 494054
              between(RowNum_OG, 494273, 494293) | # 494283
              between(RowNum_OG, 494626, 494646) | # 494636
              between(RowNum_OG, 1610156, 1610176) | # 1610166
              between(RowNum_OG, 2073074, 2073094) # 2073084
           )
    )

# Before Removing Runs
# View(filter(AllDays_Sorted,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

# After Removing Runs
# View(filter(AllDays_FirstStopID,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

Investigation of TravelDistance_Mi.

Any relation with DirChange2? Doesn’t look as if this is so.


ExtremeTravDist <- filter(AllDays_NewOrder,
                          !is.na(TravelDistance_Mi)
                         ) %>% 
  mutate(TravDist_Extreme = ifelse(TravelDistance_Mi > 1.1587121212, # 1.1587121212 is the 99th percentile
                                   "True",
                                   "False"
                                  )
                          ) %>% 
  group_by(DirChange2, TravDist_Extreme) %>% 
  summarise(TravDistMI_ExtCnts = n()
           )

# ExtremeTravDist


ExtremeTravDist_Spread <- as.data.frame(spread(ExtremeTravDist,
                                               TravDist_Extreme,
                                               TravDistMI_ExtCnts
                                              )
                                       ) %>% 
  select(False,
         True
        )

row.names(ExtremeTravDist_Spread) <- c("Change", "Same")
# str(ExtremeTravDist_Spread)
# ExtremeTravDist_Spread

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           2
          )

Investigation of TravelDistance_Mi.

Looking at specific buses and StartStop_ID.


rm(ExtremeTravDist, ExtremeTravDist_Spread)

View(arrange(group_by(AllDays_NewOrder,
                      Bus_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )


# example of extremely small TravelDistance_Mi values (looks like the odometer wasn't functioning)
View(filter(AllDays_NewOrder,
            Bus_ID == 6111 |
              Bus_ID == 7201 |
              Bus_ID == 8058
           ) %>% 
       arrange(Bus_ID, Event_Time)
    )


View(arrange(group_by(AllDays_NewOrder,
                      StartStop_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )

# example of extremely large TravelDistance_Mi values...no idea why...
View(filter(AllDays_NewOrder,
            StartStop_ID == "1003665--12" |
              StartStop_ID == "1003665--5001925" |
              StartStop_ID == "3001038--3002565"
           ) %>% 
       arrange(StartStop_ID, Event_Time)
    )

Investigation of TravelDistance_Mi & TravelDistance_Mi_New.

If TravelDisntace_Mi is below the 5th percentile for that StartStop_ID, or if TravelDisntace_Mi is above the 95th percentile for that StartStop_ID, or if TravelDistance_Mi is NA (when the BusDay_EventNum !=1), consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TD_Mi_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.


# View(tail(AllDays_NewOrder, 500))

AllDays_NewTravelDist <- 
  mutate(AllDays_NewOrder,
         TravelDistance_Mi_New = ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F >= 20,
                                        TD_Mi_SSHG_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt_F >= 20,
                                        TD_Mi_SS_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SS_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt >= 20,
                                        TD_Mi_SS_Mean,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs != 0,
                                        TravelDistance_Mi_Hvrs,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs == 0,
                                        TD_Mi_SS_Mean,
                                        TravelDistance_Mi
                                       ))))),
         TravelDistance_Mi_New_Label = 
           factor(ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F >= 20,
                         "TD_Mi_SSHG_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F < 20 &
                           TD_Mi_SS_Cnt_F >= 20,
                         "TD_Mi_SS_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SS_Cnt_F < 20 &
                           TD_Mi_SS_Cnt >= 20,
                         "TD_Mi_SS_Mean",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs != 0,
                         "TravelDistance_Mi_Hvrs",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs == 0,
                         "TD_Mi_SS_Mean",
                         "TravelDistance_Mi"
                        )))))
                 ),
         TravelDistance_Mi_NewHvrs = ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                                              TravelDistance_Mi_Hvrs != 0 &
                                              (TravelDistance_Mi_New < TD_Mi_q2 |
                                                 TravelDistance_Mi_New > TD_Mi_q98
                                              ),
                                            TravelDistance_Mi_Hvrs,
                                            TravelDistance_Mi_New
                                           ),
         TravelDistance_Mi_NewHvrs_Label =
           factor(ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                           TravelDistance_Mi_Hvrs != 0 &
                           (TravelDistance_Mi_New < TD_Mi_q2 |
                              TravelDistance_Mi_New > TD_Mi_q98
                           ),
                         "TravelDistance_Mi_Hvrs",
                         as.character(TravelDistance_Mi_New_Label)
                        )
                 ),
         SpeedAvg_Mph_NewHvrs = TravelDistance_Mi_NewHvrs / TravelTime_Hr
        )

rm(AllDays_NewOrder)
str(AllDays_NewTravelDist)

Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Quick summary and then correlation calculation.


# 38 rows meet this criteria anymore  --  appears to be the case when both the Lat Long calculations, and the TravelDistance calculations did not function properly.
View(filter(AllDays_NewTravelDist,
            is.na(TravelDistance_Mi_New) &
              BusDay_EventNum != 1
           )
    )

View(AllDays_NewTravelDist %>% 
       arrange(desc(TravelDistance_Mi_New)) %>% 
       head(500)
    )

summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


cor(select(AllDays_NewTravelDist,
           TravelDistance_Mi,
           TravelDistance_Mi_Hvrs,
           TravelDistance_Mi_New,
           TravelDistance_Mi_NewHvrs
          ),
    use = "pairwise.complete.obs"
  )

Investigation of TravelDistance_Mi_NewHvrs_Label & TravelDistance_Mi_NewHvrs_Label.

Show how the labels changed.


group_by(AllDays_NewTravelDist,
         TravelDistance_Mi_New_Label,
         TravelDistance_Mi_NewHvrs_Label
        ) %>% 
  summarise(CntNum = n(),
            CntPct = format(CntNum / nrow(AllDays_NewTravelDist),
                            scientific = 9999
                           )
           ) %>% 
  arrange(desc(CntPct)
         )

Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing the two methods of calculating TravelDistance_Mi.

First, let’s get create a function to plot the liner model equation.


lm_eqn <- function(df, y, x){
  m <- lm(y ~ x, df)
  
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            s1 = ifelse(test = coef(m)[2] > 0,
                        yes = "+",
                        no = "-"
                       ),
            r2 = format(summary(m)$r.squared,
                        digits = 3
                       )
           )
  
  eq <- substitute(italic(y) == a~~s1~~b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   l
                  )
  
  as.character(as.expression(eq)
              )             
}

Investigation of TravelDistance_Mi & TravelDistance_Mi_NewHvrs.

Scatter plot (using a 10% sample to making plotting time faster and to reduce un-needed data in the “same” splot).


set.seed(123456789)
AllDays_NewTravelDist_10Pct <- filter(AllDays_NewTravelDist,
                                      !is.na(TravelDistance_Mi_NewHvrs) &
                                        !is.na(TravelDistance_Mi)
                                     ) %>% 
  rename(DistMethod = TravelDistance_Mi_NewHvrs_Label) %>% 
  sample_frac(0.1)


TravDist_MiVsCalc <- ggplot(select(AllDays_NewTravelDist_10Pct,
                                   TravelDistance_Mi_NewHvrs,
                                   TravelDistance_Mi,
                                   DistMethod
                                  ),
                            aes(x = TravelDistance_Mi,
                                y = TravelDistance_Mi_NewHvrs,
                                colour = DistMethod
                               )
                           ) +
  scale_colour_manual(values = c("red","blue", "green", "orange", "black")
                     ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 1.5)
                 ) +
  scale_x_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  scale_y_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  theme(legend.position = "bottom", #c(0.85, 0.40),
        legend.text = element_text(size = 6)
       ) +
  annotate(label = lm_eqn(df = AllDays_NewTravelDist_10Pct,
                          x = AllDays_NewTravelDist_10Pct$TravelDistance_Mi,
                          y = AllDays_NewTravelDist_10Pct$TravelDistance_Mi_NewHvrs
                         ),
           # x = 62,
           # y = 20,
           x = 0.70,
           y = 0.00,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  annotate(label = "Reference Line (slope = 1)",
           # x = 16,
           # y = 30,
           x = 0.80,
           y = 1.05,
           geom = "text",
           size = 3,
           colour = "red"
          ) +
  labs(title = "TravelDistance_Mi vs. TravelDistance_Mi_NewHvrs",
       x = "TravelDistance_Mi",
       y = "TravelDistance_Mi_NewHvrs"
      )
# +
#   geom_jitter()

TravDist_MiVsCalc

Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing test with rbokeh.


TravDist_MiVsCalc_Bokeh <- figure(data = select(AllDays_NewTravelDist_10Pct,
                                                TravelDistance_Mi_NewHvrs,
                                                TravelDistance_Mi,
                                                DistMethod
                                               ),
                                  xlim = c(0, 1.5),
                                  ylim = c(0, 1.5),
                                  legend_location = "bottom_right"
                                 ) %>% 
  ly_points(x = TravelDistance_Mi,
            y = TravelDistance_Mi_NewHvrs,
            color = DistMethod,
            hover = c(TravelDistance_Mi_NewHvrs, TravelDistance_Mi, DistMethod)
           ) %>% 
  ly_abline(a = 0, b = 1, color = "red")

TravDist_MiVsCalc_Bokeh

Investigation of TravelDistance_Mi_New.

Calculating the minimum TravelDistance_Mi_New value at each percentile.


rm(TravDist_MiVsCalc_Bokeh)
rm(AllDays_NewTravelDist_10Pct)


summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


TravDistMiN_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          TravelDistance_Mi_New_Label,
                                          # TravelDistance_Mi_NewHvrs_Label,
                                          TravelDistance_Mi_New
                                          # TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         # PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         PctR_Round_N = round(PctR_N, 2)
         # PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiN_Ntile)
# View(head(TravDistMiN_Ntile, 500))

TravDistMiN_Ntile_Rows <- nrow(TravDistMiN_Ntile)

# View(tail(TravDistMiN_Ntile, 500))


TravDistMiN_Pctiles <- group_by(TravDistMiN_Ntile,
                                PctR_Round_N
                               ) %>% 
  summarise(
    MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    # MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    # CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    PctsAtPctile_N = CntsAtPctile_N / TravDistMiN_Ntile_Rows
    # PctsAtPctile_H = CntsAtPctile_H / TravDistMiN_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP_N = cumsum(PctsAtPctile_N)
         # CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiN_Pctiles)
TravDistMiN_Pctiles

Investigation of TravelDistance_Mi_NewHvrs

Calculating the minimum TravelDistance_Mi_NewHvrs value at each percentile.


TravDistMiH_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          # TravelDistance_Mi_New_Label,
                                          TravelDistance_Mi_NewHvrs_Label,
                                          # TravelDistance_Mi_New,
                                          TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(# PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         # PctR_Round_N = round(PctR_N, 2),
         PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiH_Ntile)
# View(head(TravDistMiH_Ntile, 500))

TravDistMiH_Ntile_Rows <- nrow(TravDistMiH_Ntile)

# View(tail(TravDistMiH_Ntile, 500))


TravDistMiH_Pctiles <- group_by(TravDistMiH_Ntile,
                                PctR_Round_H
                               ) %>% 
  summarise(
    # MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    # CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    # PctsAtPctile_N = CntsAtPctile_N / TravDistMiH_Ntile_Rows,
    PctsAtPctile_H = CntsAtPctile_H / TravDistMiH_Ntile_Rows
  ) %>% 
  mutate(# CumSumPAtP_N = cumsum(PctsAtPctile_N),
         CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiH_Pctiles)
TravDistMiH_Pctiles

Join TravDistMiH_Pctiles, TravDistMiN_Pctiles, and TravDistMi_Pctiles.

~11% of rides are still showing as less than 0.1 miles of TravelDistance_Mi_NewHvrs.


rm(TravDistMiN_Ntile_Rows, TravDistMiH_Ntile_Rows, TravDistMiN_Ntile, TravDistMiH_Ntile)


# View(TravDistMi_Pctiles)
# View(TravDistMiN_Pctiles)
# View(TravDistMiH_Pctiles)

TravDistMi_Pctiles_All <- inner_join(x = TravDistMi_Pctiles,
                                     y = TravDistMiN_Pctiles,
                                     by = c("PctR_Round" = "PctR_Round_N")
                                    ) %>% 
  inner_join(y = TravDistMiH_Pctiles,
             by = c("PctR_Round" = "PctR_Round_H")
            ) %>% 
  select(PctR_Round,
         MinTravDistMiAtPctile,
         MinTDMiAtPctile_N,
         MinTDMiAtPctile_H,
         CntsAtPctile,
         CntsAtPctile_N,
         CntsAtPctile_H,
         PctsAtPctile,
         PctsAtPctile_N,
         PctsAtPctile_H,
         CumSumPAtP,
         CumSumPAtP_N,
         CumSumPAtP_H
         )

# str(TravDistMi_Pctiles_All)

rm(TravDistMi_Pctiles, TravDistMiN_Pctiles,TravDistMiH_Pctiles)


View(TravDistMi_Pctiles_All)
TravDistMi_Pctiles_All

Investigation of TravelDistance_Mi_New.

Why are there still some small or large TravelDistance_Mi_NewHvrs values.


# View(filter(AllDays_NewTravelDist,
#             !is.na(TravelDistance_Mi_NewHvrs)
#            ) %>% 
#        select(-TD_Mi_q2,
#               -TD_Mi_q98,
#               -TD_Mi_SS_q5,
#               -TD_Mi_SS_q95,
#               -TD_Mi_SSHG_q5,
#               -TD_Mi_SSHG_q95,
#               -TD_Mi_Mean,
#               -TD_Mi_Mean_F,
#               -TD_Mi_SS_Mean,
#               -TD_Mi_SS_Mean_F,
#               -TD_Mi_SSHG_Mean,
#               -TD_Mi_SSHG_Mean_F,
#               -TD_Mi_Med,
#               -TD_Mi_Med_F,
#               -TD_Mi_SS_Med,
#               -TD_Mi_SS_Med_F,
#               -TD_Mi_SSHG_Med,
#               -TD_Mi_SSHG_Med_F,
#               -TD_Mi_Cnt,
#               -TD_Mi_Cnt_F,
#               -TD_Mi_SS_Cnt,
#               -TD_Mi_SS_Cnt_F,
#               -TD_Mi_SSHG_Cnt,
#               -TD_Mi_SSHG_Cnt_F,
#               -TT_Sec_q2,
#               -TT_Sec_q98,
#               -TT_Sec_SS_q5,
#               -TT_Sec_SS_q95,
#               -TT_Sec_SSHG_q5,
#               -TT_Sec_SSHG_q95,
#               -TT_Sec_Mean,
#               -TT_Sec_Mean_F,
#               -TT_Sec_SS_Mean,
#               -TT_Sec_SS_Mean_F,
#               -TT_Sec_SSHG_Mean,
#               -TT_Sec_SSHG_Mean_F,
#               -TT_Sec_Med,
#               -TT_Sec_Med_F,
#               -TT_Sec_SS_Med,
#               -TT_Sec_SS_Med_F,
#               -TT_Sec_SSHG_Med,
#               -TT_Sec_SSHG_Med_F,
#               -TT_Sec_Cnt,
#               -TT_Sec_Cnt_F,
#               -TT_Sec_SS_Cnt,
#               -TT_Sec_SS_Cnt_F,
#               -TT_Sec_SSHG_Cnt,
#               -TT_Sec_SSHG_Cnt_F,
#               -TT_Hr_q2,
#               -TT_Hr_q98,
#               -TT_Hr_SS_q5,
#               -TT_Hr_SS_q95,
#               -TT_Hr_SSHG_q5,
#               -TT_Hr_SSHG_q95,
#               -TT_Hr_Mean,
#               -TT_Hr_Mean_F,
#               -TT_Hr_SS_Mean,
#               -TT_Hr_SS_Mean_F,
#               -TT_Hr_SSHG_Mean,
#               -TT_Hr_SSHG_Mean_F,
#               -TT_Hr_Med,
#               -TT_Hr_Med_F,
#               -TT_Hr_SS_Med,
#               -TT_Hr_SS_Med_F,
#               -TT_Hr_SSHG_Med,
#               -TT_Hr_SSHG_Med_F,
#               -TT_Hr_Cnt,
#               -TT_Hr_Cnt_F,
#               -TT_Hr_SS_Cnt,
#               -TT_Hr_SS_Cnt_F,
#               -TT_Hr_SSHG_Cnt,
#               -TT_Hr_SSHG_Cnt_F
#              ) %>% 
#        arrange(TravelDistance_Mi_NewHvrs) %>% 
#        head(500)
#     )

View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(TravelDistance_Mi_NewHvrs) %>%
       head(500)
    )

# examples of the smallest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1424440 & RowNum_OG <= 1424460) | # 1424450  --  direction change
                (RowNum_OG >= 763292 & RowNum_OG <= 763312) | # 763302  --  direction change
                (RowNum_OG >= 1679093 & RowNum_OG <= 1679113) | # 1679103  --  direction change
                (RowNum_OG >= 2860918 & RowNum_OG <= 2860938) # 2860928  --  looks correct
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )


View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(desc(TravelDistance_Mi_NewHvrs)
              ) %>%
       head(500)
    )

# examples of the largest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1092000 & RowNum_OG <= 1092050) | # 1092030  --  direction change
                (RowNum_OG >= 1609460 & RowNum_OG <= 1609480) | # 1609470  -- direction change 
                (RowNum_OG >= 508904 & RowNum_OG <= 508924) | # 508914  --  direction change & original StopID was bad
                (RowNum_OG >= 2476345 & RowNum_OG <= 2476365) # 2476355  --  direction change
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

Investigation of TravelTime_Hr.

View(TravDistMi_Pctiles): 98% of TravelTime_Hr are between 7 seconds and 464 seconds (~8 minutes).


TravTimeHr_Ntile <- select(AllDays_NewTravelDist,
                           TravelTime_Hr
                          ) %>% 
  mutate(# Pctile = ntile(AllDays_NewTravelDist$TravelTime_Hr, 100),
         # MinR = min_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR = percent_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        ) 

# str(TravTimeHr_Ntile)

TravTimeHr_Ntile_Rows <- nrow(TravTimeHr_Ntile)

# View(tail(TravTimeHr_Ntile, 500))


TravTimeHr_Pctiles <- group_by(TravTimeHr_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTimeHr_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * 3600
        )

rm(TravTimeHr_Ntile_Rows)
rm(TravTimeHr_Ntile)
View(TravTimeHr_Pctiles)
TravTimeHr_Pctiles

Investigation of TravelTime_Hr.

Histogram of TravelTime_Sec.


TravTime_Sec_HistDen <- ggplot(filter(select(AllDays_NewTravelDist,
                                             TravelTime_Sec
                                            ),
                                      !is.na(TravelTime_Sec)
                                     ),
                               aes(x = TravelTime_Sec,
                                   y = ..density..
                                  )
                          ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # stat_bin(binwidth = 5,
  #          geom = "text",
  #          size = 2.5,
  #          vjust = 1.5,
  #          aes(label = format(..count.., big.mark = ",")
  #             ),
  #         ) +
  coord_cartesian(xlim = c(0, 180), ylim = c(0, 0.02)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Time",
       x = "Travel Time (sec)",
       y = "Density"
      )

TravTime_Sec_HistDen

Investigation of TravelTime_Sec.

TravelTime_Sec values are NA.


summary(AllDays_NewTravelDist$TravelTime_Sec)


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(is.na(TravelTime_Sec) &
                BusDay_EventNum != 1  # TravelTime purposefully not calculated here
             )
    )

# examples of TravelTime_Sec values that are NA. These are NA because the Event_Time & Departure_Time readings are not accurate (i.e., the previous Departure_Time is BEFORE or EQUAL TO the current Event_Time).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 90809 & RowNum_OG <= 90829) | # 90819
                (RowNum_OG >= 90881 & RowNum_OG <= 90901) | # 90891
                (RowNum_OG >= 2597066 & RowNum_OG <= 2597086) | # 2597076
                (RowNum_OG >= 2613305 & RowNum_OG <= 2613325) # 2613315
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt"))
    )

Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely small.


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(TravelTime_Sec,
               desc(SpeedAvg_Mph_NewHvrs)
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )

Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely large.


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 1007703 & RowNum_OG <= 1007723) | # 1007713
                (RowNum_OG >= 2373564 & RowNum_OG <= 2373584) | # 2373574
                (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
                (RowNum_OG >= 2570060 & RowNum_OG <= 2570080) # 2570070
           )
    )

Investigation of TravelTime_Sec.

Are large TravelTime_Sec values related to RouteChanges? Looks likely. When the Bus involves a Route “change”, there is almost twice as likely to be a case of an outlier TravelTime_Sec value (on the high side).


TTLargeRteChng <- select(AllDays_NewTravelDist,
                         -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                        ) %>% 
  mutate(TT_Out = factor(ifelse(TravelTime_Sec > 464,  # this is the 99th percentile
                                "Outlier",
                                "Normal"
                               )
                        )
        )

# str(TTLargeRteChng)


TTLargeRteChng_Cnts <- group_by(TTLargeRteChng,
                                RteChange2,
                                TT_Out
                               ) %>% 
  summarise(Cnts = n()
           )

TTLargeRteChng_Spread <- as.data.frame(spread(TTLargeRteChng_Cnts,
                                              TT_Out,
                                              Cnts
                                             )
                                      ) %>%
  select(-RteChange2)

row.names(TTLargeRteChng_Spread) <- c("Change", "Same")
# str(TTLargeRteChng_Spread)


# When the Bus involves a Route "change", there is almost twice as likely to be a case of an outlier TravelTime_Sec value.
TTLargeRteChng_Spread
prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           2
          )

# rm(TTLargeRteChng, TTLargeRteChng_Spread)
         

Investigation of TravelTime_Sec.

Are large TravelTime_Sec values related to RouteChanges? Looks likely.


View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )

Investigation of TravelTime_Sec.

If TravelTime_Sec is below the 5th percentile for that StartStop_ID, or if TravelTime_Sec is above the 95th percentile for that StartStop_ID, consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TT_Sec_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.


rm(TTLargeRteChng, TTLargeRteChng_Cnts, TTLargeRteChng_Spread)


NewTravTime <- mutate(AllDays_NewTravelDist,
                      TT_Sec_New = ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F >= 20,
                                          TT_Sec_SSHG_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt_F >= 20,
                                          TT_Sec_SS_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt >= 20,
                                          TT_Sec_SS_Mean,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt < 20 &
                                            RteChange2 == "Change",
                                          NA,
                                          TravelTime_Sec
                                         )))),
                      
                      TT_Sec_New_Label = 
           factor(ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F >= 20,
                         "TT_Sec_SSHG_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F < 20 &
                           TT_Sec_SS_Cnt_F >= 20,
                         "TT_Sec_SS_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                            ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt >= 20,
                         "TT_Sec_SS_Mean",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt < 20 &
                           RteChange2 == "Change",
                         NA,
                         "TravelTime_Sec"
                        ))))
                 ),
                  
                  TT_Hr_New = TT_Sec_New / (60 * 60)
           )


dim(AllDays_NewTravelDist)
dim(NewTravTime)
rm(AllDays_NewTravelDist)

summary(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )

str(select(NewTravTime,
           TravelTime_Sec,
           TT_Sec_New,
           TT_Sec_New_Label,
           TT_Hr_New
          )
   )


summary(select(NewTravTime,
               TravelTime_Sec,
               TT_Sec_New,
               TT_Sec_New_Label,
               TT_Hr_New
              )
       )

Test investigation of just the X2 Route. Box plots for time between bus arrivals (by HourGroup).


View(head(select(NewTravTime,
                 -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                )
         )
    )

X2 <- select(NewTravTime,
             -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            ) %>% 
  filter(Route == "X2")

str(X2)

View(head(arrange(X2,
                  Bus_ID,
                  Event_Time
                 ),
          500
         )
    )

X2_ByStop <- group_by(X2,
                      StopID_Clean
                     ) %>% 
  arrange(StopID_Clean,
          Event_Time) %>% 
  mutate(Event_Time_L1 = lag(Event_Time),
         TimeToEvent_Sec = as.numeric(Event_Time - Event_Time_L1),
         TimeToEvent_Min = TimeToEvent_Sec / 60
        )

View(head(X2_ByStop, 500))


# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(as.data.frame(X2_ByStop),
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                     )

TimeBtwEvents_X2_BoxPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                          TimeToEvent_Min,
                                          Event_Time_HrGroup
                                         ),
                                   aes(factor(Event_Time_HrGroup),
                                       TimeToEvent_Min,
                                       fill = factor(Event_Time_HrGroup)
                                      )
                                  ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 120)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot

Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Hour Group).


TimeBtwEvents_X2_ViolinPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                             TimeToEvent_Min,
                                             Event_Time_HrGroup
                                             ),
                                      aes(factor(Event_Time_HrGroup),
                                          TimeToEvent_Min,
                                          fill = factor(Event_Time_HrGroup)
                                         )
                                     ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 80)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot

Test investigation of just the X2 Route. Box plots for time between bus arrivals (by Zip Code).


# Count_Values is needed to display the medians on the box plots
Count_Values_z <- ddply(as.data.frame(X2_ByStop),
                        .(Stop_Zip),
                        summarise,
                        Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                       )

TimeBtwEvents_X2_BoxPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                            TimeToEvent_Min,
                                            Stop_Zip
                                           ),
                                     aes(factor(Stop_Zip),
                                         TimeToEvent_Min,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 100)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot_z

Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).


TimeBtwEvents_X2_ViolinPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                               TimeToEvent_Min,
                                               Stop_Zip
                                               ),
                                        aes(factor(Stop_Zip),
                                            TimeToEvent_Min,
                                            fill = factor(Stop_Zip)
                                           )
                                       ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 60)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

First, get the max and min times of bus stops (each day, and for each route).


rm(X2, X2_ByStop, X2_Long, X2_Pct)


RouteMinMax <- group_by(NewTravTime,
                        Route,
                        Event_Time_Date
                       ) %>% 
  summarise(MinTime = min(Event_Time),
            MaxTime = max(Event_Time)
           )

str(RouteMinMax)
View(RouteMinMax)

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

(Pulls here are done by day, as the data are too large to do at once.)


# View(head(NewTravTime, 500))

# For each record, create a random datetime between the first and last stop for that bus route (on that day).
for(i in 3:7){

set.seed(123456789)
Samp <- select(NewTravTime,
               RowNum_OG,
               Route,
               # RouteGroup,
               Event_Time_Date,
               StopID_Clean,
               starts_with("Event")
              ) %>% 
  filter(Event_Time_Date == i) %>%  # needed to do this each day (3-7) because the complete file was too large to do at once
  left_join(RouteMinMax,
            by = c("Route" = "Route",
                   "Event_Time_Date" = "Event_Time_Date"
                  )
           ) %>% 
  mutate(SampTime = as_datetime(runif(nrow(.), #200000,
                                      min = MinTime,
                                      max = MaxTime
                                     ),
                                tz = "America/New_York"
                               )
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) 

# str(Samp)
# View(head(Samp, 500))
# 
# View(
# group_by(Samp,
#          RowNum_OG
#         ) %>%
#   summarise(Cnt_Num = n(),
#             Cnt_Pct = 100 * Cnt_Num / nrow(Samp)
#            ) %>%
#   arrange(desc(Cnt_Num))
# )


# For each Route and StopID combination, get all the Event_Time values that are after the SampTime value.
# estimating approx 2hrs of runtime for all 2.8m records
Testing_A <- sqldf("   Select               t1.*
                                            ,t2.Event_Time             as NextBus
                        From                 Samp                      as t1
                             Inner Join      Samp                      as t2
                                On              t1.Route = t2.Route
                                And             t1.StopID_Clean = t2.StopID_Clean
                                And             t2.Event_Time > t1.SampTime
                        Order By             t1.Route
                                            ,t1.StopID_Clean
                                            ,t1.Event_Time
                                            ,t2.Event_Time
                  "
                 ) %>% 
  mutate(NB = as_datetime(NextBus,
                          tz = "America/New_York"
                         )
        )

# str(Testing_A)
# View(head(Testing_A, 500))
# View(head(Samp, 500))


# Filter the dataframe to only include the bus arrival at StopID that is the next to come after the SampTime.
# estimating approx 20min of runtime for all 2.8m records
Testing <- select(Testing_A,
                  -NextBus
                 ) %>% 
  group_by(RowNum_OG) %>% 
  filter(NB == min(NB)
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) %>% 
  mutate(WaitTime_Min = as.numeric(NB - SampTime),
         WaitTime_Sec = WaitTime_Min * 60,
         WaitTime_Sec2 = NB - SampTime,
         WaitTime_Min2 = WaitTime_Sec2 / 60
        ) %>% 
  as.data.frame()

assign(paste0("Testing_", i),
       Testing
      )

rm(Samp,Testing_A, Testing)
str(get(paste0("Testing_", i)))
View(get(paste0("Testing_", i)))
}


# Bind all the individual dataframes together.
WaitData_DayPull <- bind_rows(Testing_3,
                              Testing_4,
                              Testing_5,
                              Testing_6,
                              Testing_7
                             ) %>% 
  mutate(WaitTime_Sec3 = NB - SampTime,
         WaitTime_Min3 = WaitTime_Sec3 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

rm(Testing_3, Testing_4, Testing_5, Testing_6, Testing_7)
str(WaitData_DayPull)
View(head(WaitData_DayPull, 500))
View(tail(WaitData_DayPull, 500))

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

Basic investigation of any missing rows from data pulled by day.


DistinctRowNum_OG <- distinct(select(WaitData_DayPull,
                                     RowNum_OG
                                    )
                             )

str(DistinctRowNum_OG)

# View(
# anti_join(Samp,
#           DistinctRowNum_OG,
#           by = c("RowNum_OG" = "RowNum_OG")
#          )
# )


# The samp time is AFTER the last bus passed that StopID_Clean
# View(filter(Samp,
#             Event_Time > "2016-10-07 19:48:41" &
#               Route == "X2" &
#               StopID_Clean == 1003774
#            )
#     )

# Next Bus (NB) can be on the next morning
# View(filter(Testing7,
#             SampTime > "2016-10-06 23:58:00" &
#               SampTime < "2016-10-06 23:59:59")
#     )

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)

First, we need to find the most common bus routes.


rm(DistinctRowNum_OG)


# View(head(NewTravTime, 500))

set.seed(123456789)
BusGroups <- group_by(NewTravTime,
                      Route
                     ) %>% 
  summarise(Cnt_Num = n(),
            Cnt_Pct = Cnt_Num / nrow(NewTravTime)
           ) %>% 
  arrange(desc(Cnt_Num)
         ) %>% 
  mutate(RowNum = row_number(),
         RandNum = runif(n = 268),
         RouteGroup = ifelse(RandNum <= 0.2,
                             1,
                      ifelse(RandNum <= 0.4,
                             2,
                      ifelse(RandNum <= 0.6,
                             3,
                      ifelse(RandNum <= 0.8,
                             4,
                             5
                            ))))
        )

str(BusGroups)
View(BusGroups)
summary(BusGroups)

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)


# View(head(NewTravTime, 500))

# For each record, create a random datetime between the first and last stop for that bus route (on that day).
for(i in 1:5){
  
set.seed(123456789)
Samp <- left_join(NewTravTime,
                  BusGroups,
                  by = c("Route" = "Route")
                  ) %>% 
  select(RowNum_OG,
         Route,
         RouteGroup,
         Event_Time_Date,
         StopID_Clean,
         starts_with("Event")
        ) %>% 
  filter(RouteGroup == i) %>%  # needed to do this each RouteGroup (1-5) because the complete file was too large to do at once
  left_join(RouteMinMax,
            by = c("Route" = "Route",
                   "Event_Time_Date" = "Event_Time_Date"
                  )
           ) %>% 
  mutate(SampTime = as_datetime(runif(nrow(.), #200000,
                                      min = MinTime,
                                      max = MaxTime
                                     ),
                                tz = "America/New_York"
                               )
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) 

# str(Samp)
# View(head(Samp, 500))
# 
# View(
# group_by(Samp,
#          RowNum_OG
#         ) %>%
#   summarise(Cnt_Num = n(),
#             Cnt_Pct = 100 * Cnt_Num / nrow(Samp)
#            ) %>%
#   arrange(desc(Cnt_Num))
# )


# For each Route and StopID combination, get all the Event_Time values that are after the SampTime value.
# estimating approx 2hrs of runtime for all 2.8m records
Testing_A <- sqldf("   Select               t1.*
                                            ,t2.Event_Time             as NextBus
                        From                 Samp                      as t1
                             Inner Join      Samp                      as t2
                                On              t1.Route = t2.Route
                                And             t1.StopID_Clean = t2.StopID_Clean
                                And             t2.Event_Time > t1.SampTime
                        Order By             t1.Route
                                            ,t1.StopID_Clean
                                            ,t1.Event_Time
                                            ,t2.Event_Time
                  "
                 ) %>% 
  mutate(NB = as_datetime(NextBus,
                          tz = "America/New_York"
                         )
        )

# str(Testing_A)
# View(head(Testing_A, 500))
# View(head(Samp, 500))


# Filter the dataframe to only include the bus arrival at StopID that is the next to come after the SampTime.
# estimating approx 20min of runtime for all 2.8m records
Testing <- select(Testing_A,
                  -NextBus
                 ) %>% 
  group_by(RowNum_OG) %>% 
  filter(NB == min(NB)
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) %>% 
  mutate(WaitTime_Min = as.numeric(NB - SampTime),
         WaitTime_Sec = WaitTime_Min * 60
        ) %>% 
  as.data.frame()

assign(paste0("Testing", i),
       Testing
      )

rm(Samp,Testing_A, Testing)
str(get(paste0("Testing", i)))
View(get(paste0("Testing", i)))
}


# Bind all the individual dataframes together.
WaitData_RoutePull <- bind_rows(Testing1,
                                Testing2,
                                Testing3,
                                Testing4,
                                Testing5
                             ) %>% 
  mutate(WaitTime_Sec2 = NB - SampTime,
         WaitTime_Min2 = WaitTime_Sec2 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

rm(BusGroups, i, Testing3, Testing4, Testing5, Testing6, Testing7)
str(WaitData_RoutePull)
View(head(WaitData_RoutePull, 500))
View(tail(WaitData_RoutePull, 500))

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

Compare WaitData pulled by day and pulled by route.


dim(WaitData_RoutePull)
dim(WaitData_DayPull)
nrow(WaitData_RoutePull) - nrow(WaitData_DayPull)

WaitData_Diff <- anti_join(WaitData_RoutePull,
                           WaitData_DayPull,
                           by = c("RowNum_OG" = "RowNum_OG"
                                 )
                          ) %>% 
  select(-WaitTime_Min,
         -WaitTime_Sec
        )

str(WaitData_Diff)
View(head(WaitData_Diff, 500))

View(filter(WaitData_RoutePull,
            Route == "Z8" &
              StopID_Clean == 2005465
            # RowNum_OG = 2902760
            # Event_Time = 2016-10-07 19:51:47
           )
    )

View(group_by(WaitData_Diff,
              Route
             ) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = Cnt_Num / nrow(WaitData_Diff)
                ) %>% 
       arrange(desc(Cnt_Num)
              )
    )

View(filter(WaitData_Diff,
            Route == "S1"
           )
    )

View(filter(WaitData_RoutePull,
            Route == "S1" &
              StopID_Clean == 1003132
            # RowNum_OG = 1151770
            # Event_Time = 2016-10-07 09:07:12
           )
    )

# Can't tell why the pull by day has less records than the pull by route

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

Compare WaitData (pulled by route) and original data (NewTravTime).


dim(NewTravTime)  # 2,809,529 rows
dim(WaitData_RoutePull)  # 2,780,848 rows
nrow(NewTravTime) - nrow(WaitData_RoutePull)  # is 28,681 rows

str(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(WaitData_RoutePull)

Compare_NTT_WD <- left_join(NewTravTime,
                            select(WaitData_RoutePull,
                                   RowNum_OG,
                                   # Route,
                                   RouteGroup,
                                   # StopID_Clean,
                                   # Event_Time,
                                   MinTime,
                                   MaxTime,
                                   SampTime,
                                   NB,
                                   WaitTime_Sec2,
                                   WaitTime_Min2
                                  ),
                            by = c("RowNum_OG" = "RowNum_OG")
                           ) %>% 
  select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

str(Compare_NTT_WD)  # 2,810,109 rows overall  --  29,261 rows with no match
View(head(Compare_NTT_WD, 500))
View(filter(Compare_NTT_WD,
            is.na(MinTime)
           )
    )



# View(anti_join(Samp,
#                distinct(select(WaitData_RoutePull,
#                                RowNum_OG
#                               )
#                        ),
#                by = c("RowNum_OG" = "RowNum_OG")
#               )
#     )

# The SampTime is AFTER the last bus passed that StopID_Clean
# View(filter(Samp,
#               Route == "X2" &
#               StopID_Clean == 1003774
#             # RowNum_OG = 1146723
#             # Event_Time = 2016-10-07 15:32:18
#            )
#     )

Clean up the data a bit.


rm(BusGroups, RouteMinMax, Samp, Testing1, Testing2, Testing3, Testing4, Testing5, Testing_3, Testing_4, Testing_5, Testing_6, Testing_7, WaitData_DayPull, WaitData_Diff)


str(Compare_NTT_WD)
View(head(Compare_NTT_WD, 500))
View(head(mutate(Compare_NTT_WD,
                 WT_Min = as.numeric(WaitTime_Min2)
                )
         )
    )

WaitTime_AsNum <- Compare_NTT_WD %>% 
  mutate(RouteStop_ID = factor(paste(Route, StopID_Clean, sep = "__")
                              )
        )
WaitTime_AsNum$WaitTime_Sec2 <- as.numeric(WaitTime_AsNum$WaitTime_Sec2)
WaitTime_AsNum$WaitTime_Min2 <- as.numeric(WaitTime_AsNum$WaitTime_Min2)

rm(Compare_NTT_WD)
str(WaitTime_AsNum)

General exploration of wait times.


summary(WaitTime_AsNum$WaitTime_Min2)

General exploration of wait times.


WT_Quantiles <- as.data.frame(quantile(WaitTime_AsNum$WaitTime_Min2,
                                       probs = seq(0, 1, 0.01),
                                       na.rm = TRUE
                                      )
                             )

colnames(WT_Quantiles) <- "Value_Min"

WT_Quantiles$Value_Sec = format(round(WT_Quantiles$Value_Min * 60,
                                      digits = 2
                                     ),
                                nsmall = 2
                               )
WT_Quantiles$Value_Hr = format(round(WT_Quantiles$Value_Min / 60,
                                     digits = 2
                                    ),
                                nsmall = 2
                               )
WT_Quantiles$Value_Min = format(round(WT_Quantiles$Value_Min,
                                      digits = 2
                                     ),
                                nsmall = 2
                               )

WT_Quantiles$Quantile <- seq(0, 1, 0.01)

WT_Quantiles <- select(WT_Quantiles,
                       Quantile,
                       Value_Sec,
                       Value_Min,
                       Value_Hr
                      )

str(WT_Quantiles)
View(WT_Quantiles)
WT_Quantiles


View(arrange(WaitTime_AsNum,
             desc(WaitTime_Min2)
            ) %>% 
       head(., 5000)
    )

View(filter(WaitTime_AsNum,
            between(WaitTime_Min2, 60, 200)
           ) %>% 
       arrange(desc(WaitTime_Min2)
              ) 
     # %>% 
     #   head(., 5000)
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "W13" &  # only 2 bus passes in the entire dataset
              StopID_Clean == 1003728
            # Event_Time = 2016-10-03 08:42:46
           )
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "S41" &  # only 4 bus passes in the entire dataset
              StopID_Clean == 1001095
            # Event_Time = 2016-10-05 15:41:47
           )
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "D8" &  # route has VERY limited service after midnight
              StopID_Clean == 1001669
            # Event_Time = 2016-10-06 20:31:16
           )
    )

Looks like there might be an issue in wait times when very few Route-Stop combinations are included in the dataset. Let’s explore these.


RouteStop_Cnts <- group_by(WaitTime_AsNum,
                           RouteStop_ID
                          ) %>% 
  summarise(RouteStop_CntNum = n(),
            RouteStop_CntPct = RouteStop_CntNum / nrow(WaitTime_AsNum)
           ) %>% 
  arrange(RouteStop_CntNum)

View(RouteStop_Cnts)


RouteStop_CntOfCnt <- group_by(RouteStop_Cnts,
                               RouteStop_CntNum
                              ) %>% 
  summarise(RouteStopCnt_CntNum = n(),
            RouteStopCnt_CntPct = RouteStopCnt_CntNum / nrow(RouteStop_Cnts)
           ) %>% 
  mutate(RouteStopCnt_CntPct_CumSum = cumsum(RouteStopCnt_CntPct),
         x = 1 - RouteStopCnt_CntPct_CumSum
        ) %>% 
  arrange(RouteStop_CntNum)
  
 View(RouteStop_CntOfCnt)
 RouteStop_CntOfCnt

Histogram of the counts of Route-StopID combinations.


RouteStop_Cnts_Bar <- ggplot(RouteStop_CntOfCnt,
                             aes(x = RouteStop_CntNum,
                                 # y = ..density..
                                 y = RouteStopCnt_CntNum
                                )
                            ) +
  # geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  coord_cartesian(xlim = c(0, 500)
                  # ylim = c(0, 0.02)
                 ) +
  labs(title = "Variation in Routes Passing a Specific Stop",
       x = "Occurrences of Route-StopID Combiantions",
       y = "Counts"
      )

RouteStop_Cnts_Bar

Create a new dataset limiting extremely small counts of Route-StopID combinations.


WaitTime_RteCnts <- left_join(WaitTime_AsNum,
                              RouteStop_Cnts,
                              by = c("RouteStop_ID" = "RouteStop_ID")
                             ) %>% 
  select(-RouteStop_CntPct)

dim(WaitTime_AsNum)
dim(WaitTime_RteCnts)

rm(WaitTime_AsNum)
str(WaitTime_RteCnts)


# Total rows
nrow(WaitTime_RteCnts)

# Rows of rare RouteStops
nrow(filter(WaitTime_RteCnts,
            RouteStop_CntNum <= 60
           )
    ) / nrow(WaitTime_RteCnts)

# Rows of extremely long wait times
nrow(filter(WaitTime_RteCnts,
            WaitTime_Min2 > 180
           )
    ) / nrow(WaitTime_RteCnts)


select(WaitTime_RteCnts,
       WaitTime_Min2
      ) %>% 
  summary()

filter(WaitTime_RteCnts,
       RouteStop_CntNum > 60  # 12 passes per day in a 5-day dataset
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()

filter(WaitTime_RteCnts,
       WaitTime_Min2 < 180  # probably means that something went wrong
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()

Compare quantiles in the limited datasets.


a <- as.data.frame(select(WaitTime_RteCnts,
                          WaitTime_Min2
                         ) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

b <- as.data.frame(filter(WaitTime_RteCnts,
                          RouteStop_CntNum > 60
                         ) %>% 
                     select(WaitTime_Min2) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

c <- as.data.frame(filter(WaitTime_RteCnts,
                          WaitTime_Min2 < 180
                         ) %>% 
                     select(WaitTime_Min2) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

WT_Filter_Quantiles <- bind_cols(a, b, c) %>% 
  mutate(Quantile = seq(0, 1, 0.01)
        )

colnames(WT_Filter_Quantiles) <- c("All", "RteStpAbv60", "WTBlw180", "Quantile")
rm(a, b, c)
View(WT_Filter_Quantiles)
WT_Filter_Quantiles

Histogram of all wait times.


WaitTime_AllBus_HistDen <- ggplot(filter(select(WaitTime_RteCnts,
                                                WaitTime_Min2
                                               ),
                                         !is.na(WaitTime_Min2)
                                        ),
                                  aes(x = WaitTime_Min2,
                                      y = ..density..
                                     )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  scale_x_continuous(breaks = seq(0, 300, 30)
                    ) +
  coord_cartesian(xlim = c(0, 300),
                  ylim = c(0, 0.035)
                 ) +
  labs(title = "Variation in Wait Time",
       x = "Wait Time (min)",
       y = "Density"
      )

WaitTime_AllBus_HistDen

Box plots for WaitTime (all busses, by Zip Code).


# Count_Values is needed to display the medians on the box plots
BusRoute <- select(WaitTime_RteCnts,
                   Route,
                   WaitTime_Min2,
                   Stop_Zip
                  ) %>% 
  filter(Route == "X2")

CountValues_AllBus_Zip <- ddply(BusRoute,
                                .(Stop_Zip),
                                summarise,
                                Value_Counts = median(WaitTime_Min2, na.rm = TRUE)
                               )

WaitTime_AllBus_Zip_Box <- ggplot(BusRoute,
                                  aes(factor(Stop_Zip),
                                      WaitTime_Min2,
                                      fill = factor(Stop_Zip)
                                     )
                                 ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )

WaitTime_AllBus_Zip_Box

Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).


WaitTime_AllBus_Zip_Violin <- ggplot(BusRoute,
                                     aes(factor(Stop_Zip),
                                         WaitTime_Min2,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

Box plots for WaitTime (Zip Code, by HourGroupZip).


# Count_Values is needed to display the medians on the box plots
Zip <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Stop_Zip == 20002)

CountValues_AllBus_HG <- ddply(Zip,
                               .(Event_Time_HrGroup),
                               summarise,
                               Value_Counts = median(WaitTime_Min2,
                                                     na.rm = TRUE
                                                    )
                               )

WaitTime_AllBus_HG_Box <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )

WaitTime_AllBus_HG_Box

Violin plots for WaitTime (Zip Code, by HourGroupZip).


WaitTime_AllBus_HG_Vln <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 90)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )

WaitTime_AllBus_HG_Vln

Box plots for WaitTime (Route, by HourGroupZip).


# Count_Values is needed to display the medians on the box plots
Rte <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Route == "X2")

CountValues_AllBus_RteHG <- group_by(Rte,
                                     Event_Time_HrGroup
                                    ) %>% 
  summarise(
    Value_Counts = median(WaitTime_Min2,
                          na.rm = TRUE
                         ),
    VC = quantile(WaitTime_Min2, probs = 0.9, na.rm = TRUE)
    )


WaitTime_AllBus_RteHG_Box <- ggplot(Rte,
                                    aes(factor(Event_Time_HrGroup),
                                        WaitTime_Min2,
                                        fill = factor(Event_Time_HrGroup)
                                       )
                                   ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_RteHG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, max(CountValues_AllBus_RteHG$VC))
                 ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("Route X2"),
       x = "Hour Group",
       y = "Waiting Time (min)"
      ) 
# +
#   facet_wrap(~Stop_Zip
#              # nrow = 5
#             )

WaitTime_AllBus_RteHG_Box

Violin plots for WaitTime (Zip Code, by HourGroupZip).


WaitTime_AllBus_RteHG_Vln <- ggplot(Rte,
                                    aes(factor(Event_Time_HrGroup),
                                        WaitTime_Min2,
                                        fill = factor(Event_Time_HrGroup)
                                       )
                                   ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_RteHG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("(Route X2)"),
       x = "Hour Group",
       y = "Waiting Time (min)"
      ) +
  facet_wrap(~Stop_Zip
             # nrow = 5
            )

WaitTime_AllBus_RteHG_Vln

X2 Percentiles Line Graph Test.


X2_Pct <- select(WaitTime_RteCnts,
                 Route,
                 Stop_Zip,
                 Event_Time_Date,
                 Event_Time_Day,
                 Event_Time_HrGroup,
                 Event_Time_Hr,
                 Latitude,
                 Longitude,
                 WaitTime_Min2
                ) %>% 
  filter(Route == "X2") %>% 
  group_by(Event_Time_Hr,
           Stop_Zip
          ) %>% 
  summarise(Pct50 = quantile(WaitTime_Min2, probs = 0.5, na.rm = TRUE),
            Pct60 = quantile(WaitTime_Min2, probs = 0.6, na.rm = TRUE),
            Pct70 = quantile(WaitTime_Min2, probs = 0.7, na.rm = TRUE),
            Pct80 = quantile(WaitTime_Min2, probs = 0.8, na.rm = TRUE),
            Pct90 = quantile(WaitTime_Min2, probs = 0.9, na.rm = TRUE)
           )

str(X2_Pct)
View(X2_Pct)


X2_Long <- gather(X2_Pct,
                  key = Percentile,
                  value = Pctile,
                  Pct50,
                  Pct60,
                  Pct70,
                  Pct80,
                  Pct90
                )

str(X2_Long)
View(X2_Long)


X2_WaitByHr_Line <- ggplot(X2_Long,
                           aes(x = Event_Time_Hr,
                               y = Pctile,
                               factor(Percentile),
                               color = Percentile
                              )
                          ) +
  geom_line() +
  theme(legend.title=element_blank(),
        legend.position = "bottom"
       ) +
  coord_cartesian(xlim = c(0, 23)
                  # ylim = c(0, 45)
                 ) + 
  scale_x_continuous(breaks = seq(0, 23, 2)
                    ) +
  labs(title = "Waiting Time Throughout the Day",
       subtitle = ("(Route X2)"),
       x = "Hour of the Day",
       y = "Waiting Time (min)"
      ) +
  facet_wrap(~Stop_Zip)

X2_WaitByHr_Line

GET DATA READY FOR SHINY – GET DATA READY FOR SHINY – GET DATA READY FOR SHINY GET DATA READY FOR SHINY – GET DATA READY FOR SHINY – GET DATA READY FOR SHINY GET DATA READY FOR SHINY – GET DATA READY FOR SHINY – GET DATA READY FOR SHINY

BaseData: Used in plots by hour and zipcode (first two Shiny tabs).


# str(WaitTime_RteCnts)

Shiny_WaitData_Base <- select(WaitTime_RteCnts,
                              Route,
                              Stop_Zip,
                              Event_Time,
                              Event_Time_Date,
                              Event_Time_Day,
                              Event_Time_HrGroup,
                              Event_Time_Hr,
                              Latitude,
                              Longitude,
                              WaitTime_Min2
                             ) %>% 
  mutate(Event_Time_YrMthDayHr = floor_date(Event_Time, "hour")
        ) %>% 
  rename(ZipCode = Stop_Zip,
         HourGroup = Event_Time_HrGroup,
         Date = Event_Time_Date,
         Day = Event_Time_Day,
         Hour = Event_Time_Hr,
         WaitTime_Min = WaitTime_Min2
        ) %>% 
  filter(WaitTime_Min <= 180)

Shiny_WaitData_Base$Route <- factor(Shiny_WaitData_Base$Route)

str(Shiny_WaitData_Base)
View(tail(Shiny_WaitData_Base, 500))

saveRDS(Shiny_WaitData_Base,
        "Shiny_WaitData_Base.rds"
       )

Prep data for mapping.


# devtools::install_github("dkahle/ggmap")
# devtools::install_github("hadley/ggplot2")
# install.packages("ggmap", type = "source")

# devtools::install_github('hadley/ggplot2')
devtools::install_github("hadley/ggplot2@v2.2.0")
# devtools::install_github('thomasp85/ggforce')
# devtools::install_github('thomasp85/ggraph')
# devtools::install_github('slowkow/ggrepel')


tract <- 
  readOGR(dsn = "/Users/mdturse/Desktop/Analytics/DCMetroBus/tl_2016_us_zcta510",
          layer = "tl_2016_us_zcta510"
         )
  
class(tract)

# convert the GEOID to a character
tract@data$GEOID <- as.character(tract@data$GEOID)
str(tract@data)


ggtract <- tidy(tract, region = "GEOID")

# str(ggtract)
# summary(ggtract)
# View(head(ggtract, 50))



# str(Shiny_WaitData_Base)

ZipWaitTest <- filter(Shiny_WaitData_Base,
                      WaitTime_Min <= 180 &
                        !is.na(ZipCode)
                     ) %>% 
  group_by(ZipCode,
           Event_Time_YrMthDayHr
           # Event_Time_Day,
           # Event_Time_Hr
          ) %>% 
  summarise(Pct80 = quantile(WaitTime_Min, probs = 0.8, na.rm = TRUE)
           ) %>% 
  arrange(# Event_Time_Hr,
          ZipCode,
          Event_Time_YrMthDayHr
         ) %>% 
  as.data.frame() %>% 
  mutate(Event_Time_DateNew = floor_date(Event_Time_YrMthDayHr, "day"),
         Event_Time_HrNew = hour(Event_Time_YrMthDayHr),
         Pct80_Level = factor(ifelse(Pct80 < 10,
                                     "Below 10",
                              ifelse(Pct80 < 20,
                                     "Below 20",
                              ifelse(Pct80 < 30,
                                     "Below 30",
                              ifelse(Pct80 < 40,
                                     "Below 40",
                              ifelse(Pct80 < 50,
                                     "Below 50",
                              ifelse(Pct80 < 60,
                                     "Below 60",
                                     "60 Plus"
                                    )))))),
                              levels = c("Below 10", "Below 20", "Below 30", 
                                         "Below 40", "Below 50", "Below 60", "60 Plus"
                                        ),
                              ordered = TRUE
                             )
        )

str(ZipWaitTest)
ZipWaitTest$ZipCode <- as.character(ZipWaitTest$ZipCode)
str(ZipWaitTest)
summary(ZipWaitTest)

View(head(ZipWaitTest, 500))


StopZip_Left <- left_join(ZipWaitTest,
                          ggtract,
                          by = c("ZipCode" = "id")
                         )

str(StopZip_Left)
summary(StopZip_Left)

Test mapping functionaltiy.


map <- get_map(location = c(lon = -77.03676, lat = 38.89784),
               source = "google",
               # maptype = "roadmap"
               zoom = 12
              )

ggmap(map) +
  geom_polygon(aes(x = long, 
                   y = lat, 
                   group = group,
                   fill = Pct80_Level
                  ), 
               data = filter(StopZip_Left,
                             Event_Time_YrMthDayHr == as.POSIXct("2016-10-07 20:00:00")
                             # &
                             #   Stop_Zip == "20003"
                            ),
               colour = "gray1", 
               # fill = 'black', 
               alpha = .4, 
               size = .3
              ) +
# +
  # scale_fill_gradientn(colours = c("white", "royalblue4", "red"),
  #                      #  "lightsteelblue4",
  #                      # "lightpink1",
  #                      # values=cbPalette,
  #                      # values = c(1,0.5, .3, .2, .1, 0)
  #                      na.value = "black",
  #                      breaks = c(seq(0, 180, 30))
  #                      # values = rescale()
  #                     ) 
# +
  scale_fill_brewer(palette = "Spectral", # "YlOrRd" # "Set1",
                    direction = -1,
                    limits = levels(StopZip_Left$Pct80_Level)
                   )

Shiny data for mapping (used in 3rd tab).


View(head(filter(StopZip_Left,
                 Event_Time_HrNew == 15
                ),
          500
         )
    )

Shiny_WaitData_Map <- StopZip_Left %>% 
  rename(YrMthDayHr = Event_Time_YrMthDayHr,
         YrMthDay = Event_Time_DateNew,
         Hour = Event_Time_HrNew
        )

str(Shiny_WaitData_Map)


Shiny_WaitData_Map_Wed <- filter(Shiny_WaitData_Map,
                                 YrMthDay == as.POSIXct("2016-10-05")
                                )

str(Shiny_WaitData_Map_Wed)
summary(Shiny_WaitData_Map_Wed)


saveRDS(Shiny_WaitData_Map,
        "Shiny_WaitData_Map.rds"
       )

saveRDS(Shiny_WaitData_Map_Wed,
        "Shiny_WaitData_Map_Wed.rds"
       )

Clustering

Data prep.


rm(tract, ggtract, StopZip_Left, ZipWaitTest, Shiny_WaitData_Base, Shiny_WaitData_Map, Shiny_WaitData_Map_Wed)


dim(NewTravTime)
dim(WaitTime_RteCnts)


str(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(select(NewTravTime,
           matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(WaitTime_RteCnts)

 
 
# ClustData <- select(WaitTime_RteCnts,
#                     group,
#                     BusDay_EventNum,
#                     Route,
#                     RteChange2,
#                     RouteAlt,
#                     DirChange2,
#                     Route_Direction,
#                     Stop_Sequence,
#                     StopID_Indicator,
#                     Stop_County,
#                     Stop_City,
#                     Stop_Zip,
#                     Event_Time_Hr,
#                     Dwell_Time2,
#                     TravelDistance_Mi_NewHvrs,
#                     TravelDistance_Mi_NewHvrs_Label,
#                     TT_Sec_New,
#                     TT_Sec_New_Label,
#                     WaitTime_Min2
#                    ) %>% 
#   filter(WaitTime_Min2 <= 180) %>% 
#   mutate(SpeedAvg_Mph_TDMNH_TTSN = TravelDistance_Mi_NewHvrs / 
#            (TT_Sec_New / 60 / 60)
#         )
# %>% 
#   select_if(ClustData,
#             function(col) is.numeric(col) |
#               is.integer(col)
#            ) %>% 
#    scale()

# str(ClustData)
# View(tail(ClustData, 500))
# rownames(ClustData) <- ClustData$Route
# ClustData$Route <- as.factor(ClustData$Route)
# str(ClustData)
# head(ClustData)


RouteStats <- filter(WaitTime_RteCnts,
                     WaitTime_Min2 <= 180
                    ) %>% 
  mutate(SpeedAvg_Mph_TDMNH_TTSN = TravelDistance_Mi_NewHvrs / (TT_Sec_New / 60 / 60)
        ) %>% 
  group_by(Route) %>% 
  summarise(BusDayEventNum_Mean = mean(BusDay_EventNum, na.rm = TRUE),
            BusDayEventNum_Pct10 = quantile(BusDay_EventNum, probs = 0.10, na.rm = TRUE),
            BusDayEventNum_Pct25 = quantile(BusDay_EventNum, probs = 0.25, na.rm = TRUE),
            BusDayEventNum_Pct50 = quantile(BusDay_EventNum, probs = 0.50, na.rm = TRUE),
            BusDayEventNum_Pct75 = quantile(BusDay_EventNum, probs = 0.75, na.rm = TRUE),
            BusDayEventNum_Pct90 = quantile(BusDay_EventNum, probs = 0.90, na.rm = TRUE),
            StopSequence_Mean = mean(Stop_Sequence, na.rm = TRUE),
            StopSequence_Pct10 = quantile(Stop_Sequence, probs = 0.10, na.rm = TRUE),
            StopSequence_Pct25 = quantile(Stop_Sequence, probs = 0.25, na.rm = TRUE),
            StopSequence_Pct50 = quantile(Stop_Sequence, probs = 0.50, na.rm = TRUE),
            StopSequence_Pct75 = quantile(Stop_Sequence, probs = 0.75, na.rm = TRUE),
            StopSequence_Pct90 = quantile(Stop_Sequence, probs = 0.90, na.rm = TRUE),
            EventTimeHr_Mean = mean(Event_Time_Hr, na.rm = TRUE),
            EventTimeHr_Pct10 = quantile(Event_Time_Hr, probs = 0.10, na.rm = TRUE),
            EventTimeHr_Pct25 = quantile(Event_Time_Hr, probs = 0.25, na.rm = TRUE),
            EventTimeHr_Pct50 = quantile(Event_Time_Hr, probs = 0.50, na.rm = TRUE),
            EventTimeHr_Pct75 = quantile(Event_Time_Hr, probs = 0.75, na.rm = TRUE),
            EventTimeHr_Pct90 = quantile(Event_Time_Hr, probs = 0.90, na.rm = TRUE),
            DwellTime2_Mean = mean(Dwell_Time2, na.rm = TRUE),
            DwellTime2_Pct10 = quantile(Dwell_Time2, probs = 0.10, na.rm = TRUE),
            DwellTime2_Pct25 = quantile(Dwell_Time2, probs = 0.25, na.rm = TRUE),
            DwellTime2_Pct50 = quantile(Dwell_Time2, probs = 0.50, na.rm = TRUE),
            DwellTime2_Pct75 = quantile(Dwell_Time2, probs = 0.75, na.rm = TRUE),
            DwellTime2_Pct90 = quantile(Dwell_Time2, probs = 0.90, na.rm = TRUE),
            TravDistMi_Mean = mean(TravelDistance_Mi_NewHvrs, na.rm = TRUE),
            TravDistMi_Pct10 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.10, na.rm = TRUE
                                       ),
            TravDistMi_Pct25 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.25, na.rm = TRUE
                                       ),
            TravDistMi_Pct50 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.50, na.rm = TRUE
                                       ),
            TravDistMi_Pct75 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.75, na.rm = TRUE
                                       ),
            TravDistMi_Pct90 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.90, na.rm = TRUE
                                       ),
            TravTimSec_Mean = mean(TT_Sec_New, na.rm = TRUE),
            TravTimSec_Pct10 = quantile(TT_Sec_New, probs = 0.10, na.rm = TRUE),
            TravTimSec_Pct25 = quantile(TT_Sec_New, probs = 0.25, na.rm = TRUE),
            TravTimSec_Pct50 = quantile(TT_Sec_New, probs = 0.50, na.rm = TRUE),
            TravTimSec_Pct75 = quantile(TT_Sec_New, probs = 0.75, na.rm = TRUE),
            TravTimSec_Pct90 = quantile(TT_Sec_New, probs = 0.90, na.rm = TRUE),
            WaitTimMin_Mean = mean(WaitTime_Min2, na.rm = TRUE),
            WaitTimMin_Pct10 = quantile(WaitTime_Min2, probs = 0.10, na.rm = TRUE),
            WaitTimMin_Pct25 = quantile(WaitTime_Min2, probs = 0.25, na.rm = TRUE),
            WaitTimMin_Pct50 = quantile(WaitTime_Min2, probs = 0.50, na.rm = TRUE),
            WaitTimMin_Pct75 = quantile(WaitTime_Min2, probs = 0.75, na.rm = TRUE),
            WaitTimMin_Pct90 = quantile(WaitTime_Min2, probs = 0.90, na.rm = TRUE)
           ) %>% 
  as.data.frame()

str(RouteStats)

rownames(RouteStats) <- RouteStats$Route
str(RouteStats)
View(RouteStats)


RouteStats_Scaled <- select(RouteStats,
                            -Route
                           ) %>% 
  scale()

str(RouteStats_Scaled)
class(RouteStats_Scaled)
View(RouteStats_Scaled)

summary(RouteStats)
summary(RouteStats_Scaled)

# <- select_if(ClustData,
#                               function(col) is.numeric(col) |
#                                 is.integer(col)
#                              ) %>% 
  # scale() %>% 
  # as.data.frame() %>% 
  # na.omit()

# str(ClustData_NoFact)
# summary(ClustData_NoFact)

PCA


Trnsfrm <- preProcess(select(RouteStats,
                             -Route
                            ),
                      method = c("BoxCox", "center", "scale", "pca")
                     )

# loadings
Trnsfrm$rotation

RouteStats_Pca <- predict(Trnsfrm, RouteStats) %>% 
  select(-Route)
RouteStats_Pca

Clustering.

Are the data clusterable?


##### Are the data clusterable?
# gradient_col <- list(low = "steelblue", high = "white")
ClustData_Ends <- get_clust_tendency(RouteStats_Pca,
                                     n = nrow(RouteStats_Pca
                                             ) - 1,
                                     # gradient = gradient_col,
                                     seed = 123456789
                                    )

str(ClustData_Ends)

# Hopkins statistic
ClustData_Ends$hopkins_stat  # value of 0.1657494 implies that the data are not uniformly distributed (they are "clusterable")

#plot
ClustData_Ends$plot

Clustering. How many clusters are there?

kmeans, pam, and hierarchical clustring methods, using within sum of squares and silhouette measures.


# class(RouteStats_Pca)

fviz_nbclust(RouteStats_Pca, kmeans, method = "wss")  # ~8 clusters
fviz_nbclust(RouteStats_Pca, pam, method = "wss")  # ~6 clusters
fviz_nbclust(RouteStats_Pca, hcut, method = "wss")  # ~6 clusters

fviz_nbclust(RouteStats_Pca, kmeans, method = "silhouette")  # 2 clusters
fviz_nbclust(RouteStats_Pca, pam, method = "silhouette")  # 2 clusters
fviz_nbclust(RouteStats_Pca, hcut, method = "silhouette",
             hc_method = "complete")  # 2 clusters

Clustering. How many clusters are there?

kmeans method with the gap statistic, using bootstrap.


# Compute gap statistic
# kmeans version
set.seed(123456789)
# system.time(
gap_stat_km <- clusGap(RouteStats_Pca,
                       FUN = kmeans,
                       nstart = 25,
                       K.max = 10,
                       B = 500
                      )
# )

# Print
print(gap_stat_km, method = "Tibs2001SEmax")
print(gap_stat_km)


# pam version
set.seed(123456789)
gap_stat_pm <- clusGap(RouteStats_Pca,
                       FUN = pam,
                       K.max = 10,
                       B = 500
                      )

# Print
print(gap_stat_pm, method = "Tibs2001SEmax")
print(gap_stat_pm)


# hierarchical version
set.seed(123456789)
gap_stat_hcut <- clusGap(RouteStats_Pca,
                         FUN = hcut,
                         K.max = 10,
                         B = 500
                        )

# Print
print(gap_stat_hcut, method = "Tibs2001SEmax")
print(gap_stat_hcut)



# Plot kmeans
fviz_gap_stat(gap_stat_km, 
              maxSE = list(method = "Tibs2001SEmax")
             )  # 1 cluster

# Plot pam
fviz_gap_stat(gap_stat_pm, 
              maxSE = list(method = "Tibs2001SEmax")
             )  # 2 cluster

# Plot hierarchical
fviz_gap_stat(gap_stat_hcut, 
              maxSE = list(method = "Tibs2001SEmax")
             )  # 1 cluster

Clustering. How many clusters are there?

kmeans method with various different statistics.


# str(iris)

nb <- NbClust(RouteStats_Pca, #scale(iris[ ,-5]),
              distance = "euclidean",
              min.nc = 2,
              max.nc = 15,
              method = "kmeans",
              index = "all"
             )

fviz_nbclust(nb) + theme_minimal()

Clustering. How many clusters are there?

Hierarchical clustering method. Particularly looking at silhouette statistics.

# Visualize
HCDend_K2
HCDend_K3

HCDend_K4

HCDend_K5

HCDend_K6

HCDend_K7

HCDend_K8

HCDend_K9

HCDend_K10

HCDend_K11

HCDend_K12

HCDend_K13

HCDend_K14

HCDend_K15

HCSil_K2

HCSil_K3

HCSil_K4

HCSil_K5

HCSil_K6

HCSil_K7

HCSil_K8

HCSil_K9

HCSil_K10

HCSil_K11

HCSil_K12

HCSil_K13

HCSil_K14

HCSil_K15

HCSilWidth_AllK

Using kmeans, PAM, and Hierarchical clustering methods, we can say we probably have aroun 2 clusters.

Let’s try density clustering. (This tends to show that maybe there is only one “cluster,” meaning that data are not clusterable.)

# Compute DBSCAN using fpc package
kNNdistplot(RouteStats_Pca, k = 10)
abline(h = 8.5, lty = 2)
set.seed(123456789)
db <- fpc::dbscan(RouteStats_Pca,
                  eps = 8.5,
                  MinPts = 10
                )
str(db)
List of 4
 $ cluster: num [1:268] 1 1 1 1 1 1 1 1 1 1 ...
 $ eps    : num 8.5
 $ MinPts : num 10
 $ isseed : logi [1:268] TRUE TRUE TRUE TRUE TRUE TRUE ...
 - attr(*, "class")= chr "dbscan"
db
dbscan Pts=268 MinPts=10 eps=8.5
       0   1
border 5   7
seed   0 256
total  5 263
# Plot DBSCAN results
fviz_cluster(db,
             RouteStats_Pca,
             stand = FALSE,
             frame = FALSE,
             geom = "point"
            )
argument frame is deprecated; please use ellipse instead.

Investigating TravelTime_Sec.


View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(NewTravTime,
            # -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            -(TD_Mi_q2:TD_Mi_SSHG_Cnt_F),
            -(TT_Hr_q2:TT_Hr_SSHG_Cnt_F)
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )






# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )

         
         
# examples where TravelTime_Sec is unusually small (with TravelDistance_Mi values that are large).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1042228 & RowNum_OG <= 1042248) | # 1042238
                (RowNum_OG >= 53816 & RowNum_OG <= 53836) | # 53826
                (RowNum_OG >= 360571 & RowNum_OG <= 360591) | # 360581
                (RowNum_OG >= 502271 & RowNum_OG <= 502291) # 502281 (can't explian the weird TravelTime_Sec calculation here - it's not even an integer!)
           )
    )

# still trying to explain 502281...on the day of this weirdness, the bus was only in circulation for 4-5 stops (~20 minutes) on that day (Oct 6)
View(filter(AllDays_NewTravelDist,
            Bus_ID == 2711
           )
    )


# exploring large values for TravelTime_Sec
View(filter(AllDays_NewTravelDist,
            TravelTime_Sec == 300
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph2
              )
    )

# examples where TravelTime_Sec is unusually large (with TravelDistance_Mi values that are small, so SpeedAvg_Mph values are very small).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 2627459 & RowNum_OG <= 2627479) | # 2627469
                (RowNum_OG >= 2193344 & RowNum_OG <= 2193364) | # 2193354
                (RowNum_OG >= 1644123 & RowNum_OG <= 1644143) | # 1644133
                (RowNum_OG >= 869600 & RowNum_OG <= 869620) # 869610
           )
    )

Investigation of SpeedAvg_Mph2

View(Speed_Pctiles): 90% of SpeedAvg_Mph2 are between ~3mph and ~66mph.


Speed_Ntile <- as.data.frame(AllDays_NewTravelDist$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(AllDays_NewTravelDist$SpeedAvg_Mph2, 100),
         MinR = min_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR = percent_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_Ntile)

Speed_Ntile_Rows <- nrow(Speed_Ntile)

View(tail(Speed_Ntile, 500))


Speed_Pctiles <- group_by(Speed_Ntile,
                          PctR_Round
                         ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_Pctiles)

Investigation of SpeedAvg_Mph2.

Exploring the removal of outlier TravelTime_Sec and TravelDistance_Mi.


summary(select(AllDays_NewTravelDist,
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      TravelDistance_Mi > 0.0001893939 & # lowest non-zero percentile
                        TravelDistance_Mi < 1.0812500000 & # 99th percentile
                        TravelTime_Sec > 10.050000 & # 2nd percentile
                        TravelTime_Sec < 293.000000 # 98th percentile
                     ),
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2.


Speed_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                               !is.na(SpeedAvg_Mph2)
                              ),
                        aes(x = SpeedAvg_Mph2,
                            y = ..density..
                           )
                       ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       x = "Average Speed (mph)",
       y = "Density"
      )

Speed_HistDen

Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2 after removing outlier TravelTime_Sec and TravelDistance_Mi.


View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

SpeedNoOutlier_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                                        !is.na(SpeedAvg_Mph2) &
                                          TravelDistance_Mi_New > 0.077841005 & # 5th percentile
                                          # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                          TravelTime_Sec > 12.100000 # 4th percentile
                                          # TravelTime_Sec < 293.000000 # 98th percentile
                                       ),
                                 aes(x = SpeedAvg_Mph2,
                                     y = ..density..
                                    )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       subtitle = "(removed low outliers of Travel Distance and Travel Time)",
       x = "Average Speed (mph)",
       y = "Density"
      )

SpeedNoOutlier_HistDen

Investigation of SpeedAvg_Mph2.

New dataset (NoOutliers_TravelDistNTime) when removing outlier low values of TravelDistance_Mi_New and TravelTime_Sec.


View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

NoOutliers_TravelDistNTime <- filter(AllDays_NewTravelDist,
                                     TravelDistance_Mi_New > .077841005 & # 5th percentile
                                       # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                       TravelTime_Sec > 12.100000 # 4th percentile
                                       # TravelTime_Sec < 293.000000 # 98th percentile
                                    )

nrow(AllDays_NewTravelDist) - nrow(NoOutliers_TravelDistNTime)

str(NoOutliers_TravelDistNTime)
summary(NoOutliers_TravelDistNTime)

Investigation of SppedAvg_Mph2.

View(Speed_NoOut_Pctiles): Aproximately 90% of SpeedAvg_Mph2 values are between ~4mph and ~56mph.


Speed_NoOut_Ntile <- as.data.frame(NoOutliers_TravelDistNTime$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(NoOutliers_TravelDistNTime$SpeedAvg_Mph2, 100),
         MinR = min_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR = percent_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_NoOut_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_NoOut_Ntile)

Speed_NoOut_Ntile_Rows <- nrow(Speed_NoOut_Ntile)

View(tail(Speed_NoOut_Ntile, 500))


Speed_NoOut_Pctiles <- group_by(Speed_NoOut_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_NoOut_Pctiles)

Investigation of SppedAvg_Mph2.

Exloring odd/impossible values.


# Exploring when SpeedAvg_Mph2 is NA  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            is.na(SpeedAvg_Mph2)
           )
    )


# Exploring when SpeedAvg_Mph2 is zero  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            SpeedAvg_Mph2 == 0
           )
    )


# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            SpeedAvg_Mph2 > 0 &
              SpeedAvg_Mph2 < 3.2848770
           ) %>% 
       arrange(SpeedAvg_Mph2)
    )

# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 485338 & RowNum_OG <= 485358) | # 485348  --  Extreme travel time, Route Change
                (RowNum_OG >= 346952 & RowNum_OG <= 346972) | # 346962  -- Extreme travel time, Route Change 
                (RowNum_OG >= 70494 & RowNum_OG <= 70514) | # 70504  --  Extreme travel time, Route Change
                (RowNum_OG >= 2051846 & RowNum_OG <= 2051866) # 2051856  --  Extreme travel time, Route Change
           )
    )

Investigation of SpeedAvg_Mph2.

Limit the dataset based on SpeedAvg_Mph2.


NoOutliersSpeed <- filter(NoOutliers_TravelDistNTime,
                          between(SpeedAvg_Mph2,
                                  4.069300, # 5th percentile
                                  56.05651 #95th percentile
                                 )
                          )

nrow(NoOutliers_TravelDistNTime) - nrow(NoOutliersSpeed)

summary(NoOutliersSpeed)

TravelTime now looks like it has some odd values on the high end. So let’s look at those.

View(TravTime_NoOut_Pctiles): Virtually all trips should take less than 5 minutes. (The 99th percentile of of TravelTime is approximately 8 minutes.)


TravTime_NoOut_Ntile <- as.data.frame(NoOutliersSpeed$TravelTime_Hr) %>% 
  mutate(Pctile = ntile(NoOutliersSpeed$TravelTime_Hr, 100),
         MinR = min_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR = percent_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        )

colnames(TravTime_NoOut_Ntile)[1] <- "TravelTime_Hr"
str(TravTime_NoOut_Ntile)

TravTime_NoOut_Ntile_Rows <- nrow(TravTime_NoOut_Ntile)

View(tail(TravTime_NoOut_Ntile, 500))


TravTime_NoOut_Pctiles <- group_by(TravTime_NoOut_Ntile,
                                   PctR_Round
                                  ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTime_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * (60 * 60)
        )

View(TravTime_NoOut_Pctiles)

Investigating odd TravelTime_Sec values.

Trips longer than ~8 minutes.


View(filter(NoOutliersSpeed,
            TravelTime_Sec > 491 # min at the 100th percentile
           ) %>% 
       arrange(desc(TravelTime_Sec)
              )
    )

# examples of TravelTime_Sec values that are largest.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 2071759 & RowNum_OG <= 2071779) | # 2071769  --  results from a route change, and a 3hr+ wait before the new route starts
                (RowNum_OG >= 1473686 & RowNum_OG <= 1473706) | # 1473696  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 1222822 & RowNum_OG <= 1222842) | # 1222832  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 3046089 & RowNum_OG <= 3046109) # 3046099  --  results from a route change, and a 3hr wait before the new route starts
           )
    )


# examples of TravelTime_Sec values that are the smallest of the large.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 3044689 & RowNum_OG <= 3044709) | # 3044699  --  results from a route change
                (RowNum_OG >= 3022358 & RowNum_OG <= 3022378) | # 3022368  --  results from a route change
                (RowNum_OG >= 2993016 & RowNum_OG <= 2993036) | # 2993026  --  results from a previous route change (change occurred in deleted row)
                (RowNum_OG >= 2683703 & RowNum_OG <= 2683723) # 2683713  --  results from a previous route change (change occurred in deleted row)
           )
    )

Let’s look at the TravelTime_Sec values and route changes (DirChange2).

The 99th percentile of TravelTime_Sec for both, all trips, and just those trips NOT involving route changes (DirChange2 = “Same”), is approximately 5min (300 sec).

Nota Bene: The percentile calculation here is defined slightly different than in most of the above analyses (which get the lowest value in the bin created by 100 ntiles).


summary(select(NoOutliersSpeed,
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Same"
                     ),
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Change"
                     ),
               TravelTime_Sec
              )
       )


TravTimeSec_Qtiles_df <- data.frame(PctValue = seq(0, 100, 1),
                                    All = seq(1, 101, 1),
                                    Same = seq(1, 101, 1),
                                    Change = seq(1, 101, 1)
                                   )

TravTimeSec_Qtiles_df[ , 2] <- quantile(select(NoOutliersSpeed,
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 3] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Same"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 4] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Change"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

View(TravTimeSec_Qtiles_df)

Limit the dataset now based on TravelTime_Sec.


UpperLimitTravTime <- filter(NoOutliersSpeed,
                             TravelTime_Sec <= 491 # min at the 100th percentile
                             )

nrow(NoOutliersSpeed) - nrow(UpperLimitTravTime)

str(UpperLimitTravTime)

summary(UpperLimitTravTime)

Investigation of Dwell_Time2 (how long the bus is at a stop).

Differences between Dwell_Time (by WMATA) and Dwell_Time2 (by me) appear to be due to switches in RouteAlt. WMATA calculates Dwell_Time by an unknown process. The WMATA calculation is equal to my calculation, except for the records immedaitely before and after a RouteAlt switch (DirChange2).


View(filter(AllDays_NewOrder,
            Dwell_Time != Dwell_Time2
           )
    )


# Examples where the Dwell_Time and Dwell_Time2 are different
View(filter(AllDays_NewOrder,
            ( (RowNum_OG >= 65 & RowNum_OG <= 85) | # 75
                (RowNum_OG >= 162 & RowNum_OG <= 192) | # 172
                (RowNum_OG >= 431952 & RowNum_OG <= 431972) | # 431962
                (RowNum_OG >= 434595 & RowNum_OG <= 434615) # 434605  --  this record is NOT a route switch, but does has a Sequence switch (Me: should there really be a route switch here?)
            )
           )
    )

Investigation of Dwell_Time2 (how long the bus is at a stop).

First, create some “rank” stats. View(DT2_Pctiles): 95% of Dwell_Time2s are <= 23 seconds…but some weird (e.g., nearly 2 hour Dwell_Time2s exist).


DwellTime2_Ntile <- as.data.frame(AllDays_NewOrder$Dwell_Time2) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Dwell_Time2, 100),
         MinR = min_rank(AllDays_NewOrder$Dwell_Time2),
         PctR = percent_rank(AllDays_NewOrder$Dwell_Time2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DwellTime2_Ntile)[1] <- "Dwell_Time2"
str(DwellTime2_Ntile)

DwellTime2_Ntile_Rows <- nrow(DwellTime2_Ntile)

View(tail(DwellTime2_Ntile, 500))


DwellTime2_Pctiles <- group_by(DwellTime2_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinDwellAtPctile = min(Dwell_Time2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DwellTime2_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DwellTime2_Pctiles)

Investigation of Dwell_Time2 (how long the bus is at a stop).

Histogram of Dwell_Time2.


DwellTime2_HistDen <- ggplot(AllDays_NewOrder, aes(x = Dwell_Time2, y = ..density..)) +
  geom_histogram(binwidth = 1, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(1, 25), ylim = c(0, 0.05)
                 ) +
  xlab("Time a Bus Stays at a Stop (sec)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Long a Bus Stays at a Stop"
                          # ,atop(italic("xxxxx"),"")
                         )
                    )
         )

DwellTime2_HistDen

Investigation of Dwell_Time2 (how long the bus is at a stop).

Looking at some weirdly long Dwell_Time2 values.


View(arrange(AllDays_NewOrder,
             desc(Dwell_Time2)
            )
    )


# examples of extremely large Dwell_Time2s
View(filter(AllDays_NewOrder,
            (RowNum_OG >= 292669 & RowNum_OG <= 292689) | # 292679
                (RowNum_OG >= 531057 & RowNum_OG <= 531077) | # 531067
                (RowNum_OG >= 1388627 & RowNum_OG <= 1388647) | # 1388637
                (RowNum_OG >= 1645711 & RowNum_OG <= 1645731) # 1645721
           )
    )


View(filter(AllDays_NewOrder,
            Dwell_Time2 == 0
           )
    )

Investigation of Delta_Time (how early or late the bus is).

View(DT2_Pctiles): 94% of Delta_Time values are between -236 seconds and 1,259 seconds. Roughly 66% of records are within 5 min late and 5 min early…but some weird (e.g., almost 50 minute late or 40 minute early) Delta_Times exist.

Note that Delta_Time is the difference from the scheduled bus arrival. So if two buses are scheduled to arrive at a destination at 10:00pm and 10:20pm, and if the 10:20pm bus has a Delta_Time of 5 minutes, there are 25 minutes between bus arrivals at the stop.

Also note that based on a comment at https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/, the Delta_Time values don’t appear to coincide with published bus schedules (e.g., the X2 departing every 8 minutes during peak hours).


DeltTime_Ntile <- as.data.frame(AllDays_NewOrder$Delta_Time) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Delta_Time, 100),
         MinR = min_rank(AllDays_NewOrder$Delta_Time),
         PctR = percent_rank(AllDays_NewOrder$Delta_Time),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTime_Ntile)[1] <- "Delta_Time"
str(DeltTime_Ntile)

DeltTime_Ntile_Rows <- nrow(DeltTime_Ntile)

View(tail(DeltTime_Ntile, 500))


DeltTime_Pctiles <- group_by(DeltTime_Ntile,
                             PctR_Round
                            ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTime_Pctiles)
DeltTime_Pctiles

# ~66% of rows are between 5 min late and 5 min early
nrow(filter(AllDays_NewOrder,
            Delta_Time >= -300 &
              Delta_Time <= 300
           )
    ) / nrow(AllDays_NewOrder)


# examples of weird large Delta_Times
View(filter(AllDays_NewOrder,
            Delta_Time < -4202 |
              Delta_Time > 1705
           ) %>% 
       arrange(desc(Delta_Time)
              )
    )

Investigation of Delta_Time (how early or late the bus is).

Delta_Time histogram.


DeltTime_HistDen <- ggplot(AllDays_NewOrder, aes(x = (Delta_Time / 60),
                                                 y = ..density..
                                                )
                          ) +
  geom_histogram(binwidth = (5/60), fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(-5, 5)) +
  xlab("Bus Lateness (min)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Early/Late a Bus Is",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_HistDen

Investigation of Delta_Time (how early or late the bus is).

Delta_Time boxplot.


# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(AllDays_NewOrder,
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(Delta_Time / 60, na.rm = TRUE)
                     )

DeltTime_BoxPlot <- ggplot(AllDays_NewOrder,
                           aes(factor(Event_Time_HrGroup),
                               Delta_Time / 60,
                               fill = factor(Event_Time_HrGroup)
                              )
                          ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE) + 
  # coord_cartesian(ylim = c(-300, 1200)) +
  coord_cartesian(ylim = c(-5, 20)) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  xlab("Hour Group") + 
  ylab("Bus Lateness (minutes)") + 
  theme(legend.position="none", axis.text.x = element_text(angle=45)) + 
  #theme(legend.position="right", axis.text.x = element_blank()) + 
  ggtitle(expression(atop("How Early/Late is the Bus (by Hour Group)",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_BoxPlot

Investigation of Delta_Time (how early or late the bus is).

Exploring “extreme” Delta_Times. First let’s get some “rank” stats.


View(DeltTime_Pctiles)
DeltTime_Pctiles


DeltTimeAbs_Ntile <- as.data.frame(abs(AllDays_NewOrder$Delta_Time)) %>% 
  mutate(Pctile = ntile(abs(AllDays_NewOrder$Delta_Time), 100),
         MinR = min_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR = percent_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTimeAbs_Ntile)[1] <- "Delta_Time_Abs"
str(DeltTimeAbs_Ntile)

DeltTimeAbs_Ntile_Rows <- nrow(DeltTimeAbs_Ntile)

View(tail(DeltTimeAbs_Ntile, 500))


DeltTimeAbs_Pctiles <- group_by(DeltTimeAbs_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time_Abs),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTimeAbs_Pctiles)
DeltTimeAbs_Pctiles

Investigation of Delta_Time (how early or late the bus is).

Exploring “extreme” Delta_Times. Then let’s calculate the percentage of buses that are 10 minutes (or more) late/early.


HrGroup_DeltaTime_All <- group_by(AllDays_NewOrder,
                                  Event_Time_HrGroup
                                 ) %>% 
  summarise(EventAll_Cnt = n()
           )

str(HrGroup_DeltaTime_All)
View(HrGroup_DeltaTime_All)


HrGroup_DeltaTime_Above10Min <- filter(AllDays_NewOrder,
                                       abs(Delta_Time) >= 600
                                      ) %>% 
  group_by(Event_Time_HrGroup) %>% 
  summarise(EventAbove10_Cnt = n()
           )

str(HrGroup_DeltaTime_Above10Min)
View(HrGroup_DeltaTime_Above10Min)


HrGroup_DeltaTimeCompare <- inner_join(HrGroup_DeltaTime_Above10Min,
                                       HrGroup_DeltaTime_All,
                                       by = c("Event_Time_HrGroup" = "Event_Time_HrGroup")
                                      ) %>% 
  mutate(PctEventsAbove10 = EventAbove10_Cnt / EventAll_Cnt)

View(HrGroup_DeltaTimeCompare)

Investigation of Delta_Time (how early or late the bus is).

Quickly plot these “extreme” Delta_Times.


DeltTime_Above10_Cols <- ggplot(HrGroup_DeltaTimeCompare,
                                aes(factor(Event_Time_HrGroup),
                                    PctEventsAbove10
                                   )
                               ) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_text(aes(label = format(round(PctEventsAbove10, digits = 2),
                               nsmall = 2
                              )
               ),
            size = 3,
            nudge_y = (HrGroup_DeltaTimeCompare$PctEventsAbove10 * -0.1)
           ) +
  # coord_cartesian(xlim = c(-5, 5)) +
  xlab("Hour Group") + 
  ylab("Percent of All Bus Arrivals") +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  ggtitle(expression(atop("When is a Bus 10+ Minutes Late/Early"
                          # ,atop(italic("positive values are late arrivals"),
                          #      ""
                          #     )
                         )
                    )
         )

DeltTime_Above10_Cols

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Correlation.


DwellTDeltaT_Corr <- as.matrix(cor(x = AllDays_NewOrder$Dwell_Time2,
                                   y = AllDays_NewOrder$Delta_Time,
                                   use = "pairwise"
                                  )
                               )

DwellTDeltaT_Corr

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Next, let’s get a sample of data for plotting. Let’s do this for the full dataset (AllDays_NewOrder).


AllDays_NewOrder_10PctSamp <- sample_frac(AllDays_NewOrder, 0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "AllData")

str(AllDays_NewOrder_10PctSamp)

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Let’s also get a sample of data for plotting, but with a datset that removes outliers.


View(DeltTime_Pctiles)
View(DwellTime2_Pctiles)

AllDays_NewOrder_NoExtremes_10PctSamp <- filter(AllDays_NewOrder,
                                                between(Delta_Time, -402, 1705) & # removes about 2% of Delta_Time values
                                                  between(Dwell_Time2, 1, 63)  # removes about 2% of Dwell_Time2 values
                                               ) %>% 
  sample_frac(0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "OutliersRemoved")

str(AllDays_NewOrder_NoExtremes_10PctSamp)

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does not remove outliers.


DwellTDeltaT_Scatter <- ggplot(AllDays_NewOrder_10PctSamp,
                               aes(Dwell_Time2, Delta_Time)
                              ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "red") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 2200,
           y = 600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(no outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(no outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does remove outliers.


DwellTDeltaT_Scatter_NoExtremes <- ggplot(AllDays_NewOrder_NoExtremes_10PctSamp,
                                          aes(Dwell_Time2, Delta_Time)
                                         ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 50,
           y = -475,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(2% of outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(2% of outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_NoExtremes

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from both datasets together.


CombinedData <- rbind(AllDays_NewOrder_10PctSamp,
                      AllDays_NewOrder_NoExtremes_10PctSamp
                     )

CombinedData$DataSet <- factor(CombinedData$DataSet)

str(CombinedData)


DwellTDeltaT_Scatter_Combined <- ggplot(CombinedData,
                                        aes(x = Dwell_Time2,
                                            y = Delta_Time,
                                            colour = DataSet
                                           )
                                       ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  coord_cartesian(xlim = c(0, 500), ylim = c(-1000, 2000)
                 ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "AllData"
                           ),
              method = "lm",
              colour = "red"
             ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "OutliersRemoved"
                           ),
              method = "lm",
              colour = "blue"
             ) +
  # facet_wrap( ~ DataSet, ncol = 2) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -800,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  theme(legend.position = "bottom") +
  labs(title = "Lateness vs Time at Stop",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
                          # ,atop(italic("2% of outliers removed"),
                          #       ""
                          #      )
         #                 )
         #            )
         # )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_Combined

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

---
title: "R Notebook for WMATA Metrobus Data"
output:
  html_notebook: default
  html_document: default
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook for analysis using data on the DC Bus System (WMATA Metrobus).  The data were obtained here:

https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/


Load the packages to be used.
```{r echo = FALSE, message = FALSE}

library("jsonlite")           # manipulating JSON files for zip codes
library("sqldf")              # sql-based data manipulation
library("tcltk")
library("tidyr")              # data manipulation
library("plyr")               # data manipulation
library("dplyr")              # data manipulation
library("magrittr")           # data manipulation (piping data)
library("stringr")            # string manipulation
library("data.table")         # used in testing data manipulation for speed increases
library("lubridate")          # date manipulation
library("geosphere")          # calculating Haversine distance
library("ggplot2")            # general plotting
library("ggvis")              # general plotting
library("rbokeh")             # general plotting
library("ggmap")              # general plotting of maps
library("rgdal")              # used in plotting shapefiles
library("broom")              # used in plotting shapefiles
library("maptools")           # used in plotting shapefiles
library("rgeos")              # used in plotting shapefiles
library("caret")              # used in PCA
library("cluster")            # used for clustering
library("fpc")                # used for clustering
library("dbscan")             # used for clustering
library("NbClust")            # used for clustering
library("factoextra")         # plotting clusters
# library("gpclib")
# install.packages('rgeos', type='source')
# install.packages('rgdal', type='source')
# install.packages("NbClust")

```


Get the Bus data.

First let's check the working directory.
```{r}

getwd()

```


Then, actually get the data.
```{r echo = FALSE}

setwd("/Users/mdturse/Desktop/Analytics/DCMetroBus/Bus AVL Oct 2016")

for (i in 3:7){
  assign(paste0("Oct0", i, "Raw"),
         read.delim(paste0("2016100", i, "MetrobusAVL.txt"),
                    sep = "\t",
                    header = TRUE,
                    na.strings = NULL
                   )
        )
  
  message("Oct0", i, "Raw")
 
  str(get(paste0("Oct0", i, "Raw")
         )
     )
  }

```


Put the daily data together.
```{r}

AllDays <- bind_rows(list(Oct03Raw, Oct04Raw, Oct05Raw, Oct06Raw, Oct07Raw),
                     .id = c("group")
                    )
# dim(AllDays)
str(AllDays)

```


Deleting old data frames.
```{r}

for (i in 3:7){
  rm(list = ls(pattern = paste0("Oct0", i, "Raw")
              )
    )
  
  message("Deleting Oct0", i, "Raw")
  }

```


Updating variable types.

Then, sorting the data and adding a RowNumber (to be used for identifying rows later in the analyses.)
```{r}

rm(i)


AllDays$group <- factor(AllDays$group)
AllDays$Route_Direction <- factor(AllDays$Route_Direction)
AllDays$Event_Time <- as.POSIXct(AllDays$Event_Time, format = "%m-%d-%y %I:%M:%S %p")
AllDays$Departure_Time <- as.POSIXct(AllDays$Departure_Time, format = "%m-%d-%y %I:%M:%S %p")

str(AllDays)


AllDays_Sorted <- arrange(AllDays,
                          Bus_ID,
                          Event_Time
                         ) %>% 
  mutate(RowNum_OG = row_number() # this is useful in identify the row later on
        )

rm(AllDays)
str(AllDays_Sorted)

# View(head(AllDays_Sorted, 100))

```


Inspecting the values of Stop_ID, and finding that it can take the values "" (blank) and "NULL".
```{r}

View(group_by(AllDays_Sorted,
              Stop_ID
             ) %>% 
       summarise(
         Cnt = n()
         ) %>% 
       arrange(Stop_ID)
    )

View(filter(AllDays_Sorted,
            is.na(Stop_ID) |
              Stop_ID == "" |
              Stop_ID == "NULL"
           ) %>% 
       arrange(Stop_Desc)
    )

```


Creating a table of distinct Stop_Desc values when Stop_ID is "" (blank) or "NULL".
```{r}

StopID_New <- filter(AllDays_Sorted,
                     is.na(Stop_ID) |
                       Stop_ID == "" |
                       Stop_ID == "NULL"
                    ) %>% 
  select(Stop_ID, Stop_Desc) %>% 
  distinct() %>% 
  arrange(Stop_ID, Stop_Desc) %>% 
  mutate(StopID_New = 1:nrow(.)
        )

View(StopID_New)
StopID_New

```


Creating a full updated table by filling in StopID_New for when Stop_ID is "" (blank) or NULL.
```{r}

AllDays_StopIDNew <- left_join(AllDays_Sorted,
                               select(StopID_New,
                                      Stop_Desc,
                                      StopID_New
                                     ),
                               by = c("Stop_Desc" = "Stop_Desc")
                              ) %>% 
  mutate(StopID_Clean = ifelse(is.na(StopID_New),
                               Stop_ID,
                               StopID_New
                              ),
         StopID_Indicator = factor(ifelse(is.na(StopID_New),
                                          "ID_OK",
                                          "ID_Bad"
                                         )
                                  )
        )

rm(StopID_New)
rm(AllDays_Sorted)
str(AllDays_StopIDNew)

# View(tail(AllDays_StopIDNew, 500))
# View(filter(AllDays_StopIDNew,
#             Stop_Desc == "METROWAY ANNNOUCEMNT CORR"
#            )
#     )

```


Lat Long stats for pulling in Zip codes later.
```{r}

LL_Stats <- group_by(AllDays_StopIDNew,
                     StopID_Clean
                    ) %>% 
  summarise(Lat_Mean = mean(Latitude, na.rm = TRUE),
            Lat_Med = median(Latitude, na.rm = TRUE),
            Lng_Mean = mean(Longitude, na.rm = TRUE),
            Lng_Med = median(Longitude, na.rm = TRUE)
           ) %>% 
  mutate(Lat_MeaLessMed = Lat_Mean - Lat_Med,
         Lng_MeaLessMed = Lng_Mean - Lng_Med,
         RowNum = row_number()
        )

str(LL_Stats)
summary(LL_Stats)

View(head(arrange(LL_Stats,
                  Lat_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lat_MeaLessMed)
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  Lng_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lng_MeaLessMed)
                 ),
          500
         )
    )

```


Pulling in Zip Code data from api.geonames.org.
```{r}

# URL EXAMPLE:
# http://api.geonames.org/findNearbyPostalCodesJSON?lat=38.89560&lng=-76.94873&radius=0&username=supermdat

url_1 <- "http://api.geonames.org/findNearbyPostalCodesJSON?lat="
url_2 <- "&lng="
url_3 <- "&radius=0&username="
username <- "supermdat"


# need to group in bunches as http://api.geonames.org limits pulls to 2000 per hour


##### Store everything in multiple lists
pages1 <- list()


system.time(

for(i in 1:1000){
  lat <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lat_Med)
  
  lng <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lng_Med)
  
  APIData1 <- fromJSON(paste0(url_1,
                              lat,
                              url_2,
                              lng,
                              url_3,
                              username
                             ),
                       flatten = TRUE
                      )
  
  message("Retrieving Zip Code ", i)
  
  pages1[[i]] <- APIData1$postalCodes
  
}
)


##### Combine the lists into one page
Zips1 <- rbind.pages(pages1[sapply(pages1, length) > 0])


##### Combine all pages
Zips_All <- bind_rows(Zips0,
                      Zips1,
                      Zips2,
                      Zips3,
                      Zips4,
                      Zips5,
                      Zips6,
                      Zips7,
                      Zips8,
                      Zips9,
                      Zips10,
                      .id = "id"
                     ) %>% 
  mutate(UniqueLatLng = paste(lat, lng, sep = "__")
        )

# str(Zips_All)
# View(head(Zips_All))


# str(LL_Stats)
LL_Stats_UnqLatLng <- mutate(LL_Stats,
                             UniqueLatLng = paste(Lat_Med, Lng_Med, sep = "__")
                            )

# str(LL_Stats_UnqLatLng)
# View(head(LL_Stats_UnqLatLng))


LL_StatsZips <- left_join(LL_Stats_UnqLatLng,
                          Zips_All,
                          by = c("UniqueLatLng" = "UniqueLatLng")
                         )

str(LL_StatsZips)
# View(head(LL_StatsZips))

# Not sure whey these couldn't be found (why they're NA)
View(filter(LL_StatsZips,
            is.na(postalCode)
           )
    )

```


Join to create one dataset that also includes Zip variables.
```{r}

rm(url_1, url_2, url_3, username, pages0, pages1, pages2, pages3, pages4, pages5, pages6, pages7, pages8, pages9, pages10, i, lat, lng, APIData0, APIData1, APIData2, APIData3, APIData4, APIData5, APIData6, APIData7, APIData8, APIData9, APIData10, LL_Stats, LL_Stats_UnqLatLng)


AllDays_Zips <- left_join(AllDays_StopIDNew,
                          LL_StatsZips,
                          by = c("StopID_Clean" = "StopID_Clean")
                         ) %>% 
  rename(Stop_State = adminCode1,
         Stop_County = adminName2,
         Stop_City = placeName,
         Stop_Zip = postalCode
         )

rm(AllDays_StopIDNew, LL_StatsZips)
str(AllDays_Zips)

```


Updating variable types.
```{r}

AllDays_Zips$Stop_State <- factor(AllDays_Zips$Stop_State)
AllDays_Zips$Stop_County <- factor(AllDays_Zips$Stop_County)
AllDays_Zips$Stop_Zip <- factor(AllDays_Zips$Stop_Zip)
AllDays_Zips$Stop_City <- factor(AllDays_Zips$Stop_City)

AllDays_Zips$distance <- as.numeric(AllDays_Zips$distance)
AllDays_Zips$countryCode <- factor(AllDays_Zips$countryCode)
AllDays_Zips$adminName1 <- factor(AllDays_Zips$adminName1)

str(AllDays_Zips)

```


Feature engineering.

Inspecting incidences of consecutive Stop_IDs. This is done because investigation showed that many conseutive events occurr at the same Stop_ID, but with various Dwell_Times, Odometer_Distances, etc.  All of which affect calculations and analyses.

Create data on the runs (consecutive Stop_IDs).
```{r}

StopID_Runs <- rle(AllDays_Zips$StopID_Clean)

StopID_Runs$ends <- cumsum(StopID_Runs$lengths)

StopID_Runs$starts <- ifelse(is.na(lag(StopID_Runs$ends)
                                  ),
                             1,
                             lag(StopID_Runs$ends) + 1
                            )

str(StopID_Runs)
# class(StopID_Runs)
# 
# StopID_Runs_df <- data.frame(unclass(StopID_Runs))
# str(StopID_Runs_df)
# class(StopID_Runs_df)
# rm(StopID_Runs_df)

```


Trying to link data on RunsGroups with the original data (AllDays_Sorted). The goal is to select only one record per RunsGroup - that being the record with the longest Dwell_Time.

I attempted this computation using both data.frames (dplyr) and data.tables (data.table). However, with 2,809,062 rows in one dataset and 3,119,443 rows in the other dataset, the current computation time is over 5 days...so I'm trying a different strategy to only select the first record in a run.
```{r}

# Create a RunsGroup variable for each run
# StopID_Runs_df$RunsGroup <- paste0("g", seq(1:nrow(StopID_Runs_df)
#                                            )
#                                   )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# StopID_Runs_df <- StopID_Runs_df %>% 
#   mutate(RowNum = row_number()
#         )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# 
# # Converting to data.tables for, hopefully, improved performance (speed) in computation
# StopID_Runs_dt <- data.table(StopID_Runs_df)
# setkey(StopID_Runs_dt, RowNum)
# str(StopID_Runs_dt)
# 
# AllDays_Sorted_dt <- data.table(AllDays_Sorted)
# setkey(AllDays_Sorted_dt, RowNum_OG)
# str(AllDays_Sorted_dt)
# # rm(AllDays_Sorted_dt)
# 
# 
# # Actual loop to perform the computations and link to original data (AllDays_Sorted_dt)
# GroupData <- list()
# for(i in 1:nrow(StopID_Runs_dt)
#    ) {
#   assign(paste0("group_", i),
#            StopID_Runs_dt[RowNum == i, RunsGroup]
#           )
# 
#     #####  The code below is the same code as above, but done with dplyr  #####
# 
#     # assign(paste0("group_", i),
#   #        filter(StopID_Runs_df,
#   #               RowNum == i
#   #              ) %>% 
#   #          select(RunsGroup)
#   #       )
# 
#   assign(paste0("group_", i, "_start"),
#          StopID_Runs_dt[RowNum == i, starts]
#         )
# 
#   assign(paste0("group_", i, "_end"),
#          StopID_Runs_dt[RowNum == i, ends]
#         )
# 
#   assign(paste0("group_", i, "_rows"),
#          AllDays_Sorted_dt[RowNum_OG >= as.numeric(get(paste0("group_", i, "_start")
#                                                       )
#                                                   ) &
#                            RowNum_OG <= as.numeric(get(paste0("group_", i, "_end")
#                                                       )
#                                                   ),
#                            RunsGroup := as.character(get(paste0("group_", i)
#                                                         )
#                                                     )
#                           ]
# 
#     #####  The code below is the same as the code above, but done with dplyr  #####
# 
#          # filter(AllDays_Sorted,
#          #        between(RowNum_OG,
#          #                as.numeric(get(paste0("group_", i, "_start")
#          #                              )
#          #                          ),
#          #                as.numeric(get(paste0("group_", i, "_end")
#          #                              )
#          #                          )
#          #               )
#          #       ) %>% 
#          #   mutate(RunsGroup = as.character(get(paste0("group_", i)
#          #                                     )
#          #                                 )
#          #        )
#         )
# 
#   GroupData[[i]] <- get(paste0("group_", i, "_rows"))
# 
#   message("Processing Group ", i, " of 2,809,062")
# }
# 
# 
# GroupData_df <- rbind.fill(GroupData)
# str(GroupData_df)
# head(GroupData_df)
# tail(GroupData_df)
# # rm(GroupData_df)
# 
# 
# group_1
# group_1_start
# group_1_end
# group_1_rows
# group_2_rows
# group_3_rows
# group_50_rows
# str(group_50_rows)
# group_2809062_rows
# GroupData[[1]]
# GroupData[[50]]
# 
# 
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# 
# # head(StopID_Runs$starts, 20)
# # head(AllDays_NewOrder$Stop_ID, 20)
# # 
# # 
# # dat <- as.data.frame(c(1,1,7,7,7,9,6,8,2,2,2,1,1,1,1,1))
# # colnames(dat)[1] <- "dat"
# # r <- rle(dat$dat)
# # dat$run <- rep(r$lengths, r$lengths)
# # dat$runLag <- lag(dat$run)
# # dat$cond <- rep(r$values, r$lengths)
# # dat
# # View(dat)

```


When consecutive Stop_ID occurrs, only take the first occurrence. This is done because the computation time to select only the record with the longest Dwell_Time for each run was too long (over 5 days).

This is probably less than ideal with regards to Dwell_Time, but should not make much difference for calculations of travel time, speed, etc.
```{r}

AllDays_FirstStopID <- AllDays_Zips[StopID_Runs$starts, ]

dim(AllDays_Zips)
dim(AllDays_FirstStopID)

nrow(AllDays_Zips) - nrow(AllDays_FirstStopID)

rm(AllDays_Zips, StopID_Runs)
str(AllDays_FirstStopID)

```


Feature engineering.

Creating new variables.
```{r}

AllDays_AddVars <- mutate(AllDays_FirstStopID,
                          Odometer_Distance_Mi = Odometer_Distance / 5280, #5,280 feet in 1 mile
                          Dwell_Time2 = as.numeric(Departure_Time - Event_Time),
                          Event_Time_Yr = as.integer(year(Event_Time)),
                          Event_Time_Mth = as.integer(month(Event_Time)),
                          Event_Time_Date = day(Event_Time),
                          Event_Time_Day = wday(Event_Time, label = TRUE),
                          Event_Time_Hr = hour(Event_Time),
                          Event_Time_Min = minute(Event_Time),
                          Event_Time_HrGroup = factor(ifelse(Event_Time_Hr < 3,
                                                             "Group0_2",
                                                      ifelse(Event_Time_Hr < 6,
                                                             "Group3_5",
                                                      ifelse(Event_Time_Hr < 9,
                                                             "Group6_8",
                                                      ifelse(Event_Time_Hr < 12,
                                                             "Group9_11",
                                                      ifelse(Event_Time_Hr < 15,
                                                             "Group12_14",
                                                      ifelse(Event_Time_Hr < 18,
                                                             "Group15_17",
                                                      ifelse(Event_Time_Hr < 21,
                                                             "Group18_20",
                                                      ifelse(Event_Time_Hr < 24,
                                                             "Group21_23"
                                                            )))))))),
                                                         levels = c("Group0_2",
                                                                    "Group3_5",
                                                                    "Group6_8",
                                                                    "Group9_11",
                                                                    "Group12_14",
                                                                    "Group15_17",
                                                                    "Group18_20",
                                                                    "Group21_23"
                                                                   ),
                                                         ordered = TRUE
                                                     )
                         )

rm(AllDays_FirstStopID)
str(AllDays_AddVars)

```


Function for calculating the distance traveled based on the Haversine formula.  Original code from: https://www.r-bloggers.com/great-circle-distance-calculations-in-r/
```{r}

# Calculates the geodesic distance between two points specified by radian latitude/longitude using the Haversine formula (hf)
# gcd.hf <- function(long1, lat1, long2, lat2) {
#   R <- 6371 # Earth mean radius [km]
#   delta.long <- (long2 - long1)
#   delta.lat <- (lat2 - lat1)
#   a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
#   c <- 2 * asin(min(1,sqrt(a)))
#   d = R * c * 0.621371 # 1 km = 0.621371 miles
#   return(d) # Distance in miles
# }

```


Feature engineering.

Creating more variables. Creating a BusEvent row number for future identification purposes. Then, creating various variables to analyze distance traveled and speed.
```{r}

AllDays_BusDay <- group_by(AllDays_AddVars,
                           Bus_ID,
                           Event_Time_Date
                          ) %>% 
  mutate(BusDay_EventNum = row_number(),  # used to identify Bus movements on a particular date
         
         Route_Lag1 = lag(Route),  # used in future analyses to identify Route changes
         RouteAlt_Lag1 = lag(RouteAlt),  # used in future analyses to identify RouteAlt (direction) changes
         
         Odometer_Distance_Lag1 = lag(Odometer_Distance),
         
         Latitude_L1 = lag(Latitude),
         Longitude_L1 = lag(Longitude),
         # Lat_Radian = Latitude*pi/180,
         # Long_Radian = Longitude*pi/180,
         # Lat_Radian_L1 = lag(Lat_Radian),
         # Long_Radian_L1 = lag(Long_Radian),
         
         # accounting for potential negative distances
         TravelDistance_Ft = ifelse(Odometer_Distance > Odometer_Distance_Lag1,
                                    Odometer_Distance - Odometer_Distance_Lag1,
                                    NA
                                   ),
         TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
         
         # TravelDistance_Mi2 = gcd.hf(long1 = Long_Radian_L1,
         #                             lat1 = Lat_Radian_L1,
         #                             long2 = Long_Radian,
         #                             lat2 = Lat_Radian
         #                            ),
         
         TravelDistance_Mi_Hvrs = 
                              # ifelse((is.na(Longitude_L1) | is.na(Latitude_L1)
                              #        ),
                              #        NA,
                              distHaversine(cbind(Longitude_L1, Latitude_L1),
                                            cbind(Longitude, Latitude)
                                           ) * 0.000621371, # 0.000621371 miles = 1 meter
         
         # accounting for potential negative times
         TravelTime_Sec = as.numeric(ifelse(Event_Time > lag(Departure_Time),
                                            Event_Time - lag(Departure_Time),
                                            NA
                                           )
                                    ),
         TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
         
         # accounting for potential negative or zero travel times
         SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
                               TravelDistance_Mi / TravelTime_Hr,
                               NA
                              ),
         
         Start_ID = lag(StopID_Clean),
         Start_Desc = lag(Stop_Desc),
         StartStop_ID = ifelse(is.na(Start_ID),
                               paste("NULL", StopID_Clean, sep = "--"),
                               paste(Start_ID, StopID_Clean, sep = "--")
                              )
        ) %>% 
  as.data.frame()


rm(AllDays_AddVars)
str(AllDays_BusDay)

# summary(AllDays_BusDay)

# View(tail(AllDays_BusDay, 50))

```


Inspecting for issues with StartStop_ID (where the value is either NA or contains NULL). They ONLY exist when BusDay_EventNum = 1 (which is by design). So everything looks OK.
```{r}

View(group_by(AllDays_BusDay,
              StartStop_ID
             ) %>% 
       summarise(
         Cnt = n()
       ) %>% 
       arrange(desc(Cnt)
              )
    )

View(filter(AllDays_BusDay,
            (is.na(StartStop_ID) |
              str_detect(StartStop_ID, "NULL")
            ) &
              BusDay_EventNum != 1
           )
    )

```


Stats (quantiles) overall for TravelDistance_Mi.
```{r}

Quantiles_dt <- AllDays_BusDay %>% 
  mutate(TD_Mi_q2 = quantile(x = TravelDistance_Mi, probs = 0.02, na.rm = TRUE),
         TD_Mi_q98 = quantile(x = TravelDistance_Mi, probs = 0.98, na.rm = TRUE),
         TT_Sec_q2 = quantile(x = TravelTime_Sec, probs = 0.02, na.rm = TRUE),
         TT_Sec_q98 = quantile(x = TravelTime_Sec, probs = 0.98, na.rm = TRUE),
         TT_Hr_q2 = quantile(x = TravelTime_Hr, probs = 0.02, na.rm = TRUE),
         TT_Hr_q98 = quantile(x = TravelTime_Hr, probs = 0.98, na.rm = TRUE)
        ) %>% 
  data.table()


Stats <- Quantiles_dt %>% 
  mutate(TD_Mi_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Mean_F = mean(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                             na.rm = TRUE
                            ),
         TD_Mi_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Med_F = median(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                              na.rm = TRUE
                             ),
         TD_Mi_Cnt = sum(!is.na(TravelDistance_Mi)
                        ),
         TD_Mi_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98]
                                 )
                          ),
            
         TT_Sec_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Mean_F = mean(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                              na.rm = TRUE
                             ),
         TT_Sec_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Med_F = median(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                               na.rm = TRUE
                              ),
         TT_Sec_Cnt = sum(!is.na(TravelTime_Sec)
                         ),
         TT_Sec_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98]
                                   )
                           ),

         TT_Hr_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Mean_F = mean(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                             na.rm = TRUE
                            ),
         TT_Hr_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Med_F = median(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                              na.rm = TRUE
                             ),
         TT_Hr_Cnt = sum(!is.na(TravelTime_Hr)
                        ),
         TT_Hr_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98]
                                 )
                          )
        ) %>% 
  data.frame()

rm(AllDays_BusDay)
rm(Quantiles_dt)
str(Stats)
# View(head(Stats, 50))

```


Stats for StartStop_ID.
```{r}

Quantiles_SS_dt <- group_by(Stats,
                            StartStop_ID
                           ) %>% 
  mutate(TD_Mi_SS_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SS_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SS_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SS_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SS_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SS_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt <- group_by(Quantiles_SS_dt,
                       StartStop_ID
                      ) %>% 
  mutate(TD_Mi_SS_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Mean_F = mean(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                na.rm = TRUE
                               ),
         TD_Mi_SS_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Med_F = median(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                 na.rm = TRUE
                                ),
         TD_Mi_SS_Cnt = sum(!is.na(TravelDistance_Mi)
                           ),
         TD_Mi_SS_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95]
                                    )
                             ),
            
         TT_Sec_SS_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Mean_F = mean(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Sec_SS_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Med_F = median(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                  na.rm = TRUE
                                 ),
         TT_Sec_SS_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SS_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95]
                                     )
                              ),

         TT_Hr_SS_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Mean_F = mean(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                na.rm = TRUE
                               ),
         TT_Hr_SS_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Med_F = median(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Hr_SS_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SS_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95]
                                    )
                             )
        ) %>% 
  data.frame()

rm(Stats)
rm(Quantiles_SS_dt)
str(Stats_StSt)
# View(head(Stats_StSt, 50))

```


Stats for StartStop_ID with Event_Time_HrGroup.
```{r}

Quantiles_SSHG_dt <- group_by(Stats_StSt,
                              StartStop_ID,
                              Event_Time_HrGroup
                             ) %>% 
  mutate(TD_Mi_SSHG_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SSHG_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SSHG_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SSHG_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SSHG_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SSHG_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt_HrGrp <- group_by(Quantiles_SSHG_dt,
                             StartStop_ID,
                             Event_Time_HrGroup
                            ) %>% 
  mutate(TD_Mi_SSHG_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Mean_F = mean(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TD_Mi_SSHG_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Med_F = median(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TD_Mi_SSHG_Cnt = sum(!is.na(TravelDistance_Mi)
                             ),
         TD_Mi_SSHG_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95]
                                      )
                               ),
            
         TT_Sec_SSHG_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Mean_F = mean(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Sec_SSHG_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Med_F = median(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                    na.rm = TRUE
                                   ),
         TT_Sec_SSHG_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SSHG_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95]
                                       )
                                ),

         TT_Hr_SSHG_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Mean_F = mean(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TT_Hr_SSHG_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Med_F = median(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Hr_SSHG_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SSHG_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95]
                                      )
                               )
        ) %>% 
  data.frame()

rm(Stats_StSt)
rm(Quantiles_SSHG_dt)
str(Stats_StSt_HrGrp)
# View(head(Stats_StSt_HrGrp, 50))

```


Feature engineering.

Creating a BusEventRoute row number, and a RouteAlt_Lag1 indicator for future identification purposes. 
```{r}

# rm(Quantiles_dt)
# rm(Quantiles_SS_dt)
# rm(AllDays_BusDay)
# rm(Quantiles_SSHG_dt)
# rm(Stats_StSt)

# AllDays_BusDayRoute <- group_by(Stats_StSt_HrGrp,
#                                 Bus_ID,
#                                 Event_Time_Date,
#                                 Route
#                                ) %>% 
#   mutate(RouteAlt_Lag2 = lag(RouteAlt)  # used in future analyses to identify RouteAlt (direction) changes
#          
#          # Odometer_Distance_Lag1 = lag(Odometer_Distance),
#          # 
#          # # accounting for potential negative distances
#          # TravelDistance_Ft = ifelse(Odometer_Distance >= Odometer_Distance_Lag1,
#          #                            Odometer_Distance - Odometer_Distance_Lag1,
#          #                            NA
#          #                           ),
#          # TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
#          # 
#          # # accounting for potential negative times
#          # TravelTime_Sec = as.numeric(ifelse(Event_Time >= lag(Departure_Time),
#          #                                    Event_Time - lag(Departure_Time),
#          #                                    NA
#          #                                   )
#          #                            ),
#          # TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
#          # 
#          # # accounting for potential negative or zero travel times
#          # SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
#          #                       TravelDistance_Mi / TravelTime_Hr,
#          #                       NA
#          #                      )
#         ) %>% 
#   data.frame()
# 
# rm(Stats_StSt_HrGrp)
# str(AllDays_BusDayRoute)

```


Feature engineering.

Calculating a variable to know if the RouteAlt changed. Could be useful in helping identifying weirdness in calculated distances and speeds.
```{r}

# rm(Stats_StSt_HrGrp)

AllDays_DirChange <- Stats_StSt_HrGrp %>%  # AllDays_BusDayRoute %>% 
  mutate(RteChange = ifelse(Route == Route_Lag1,
                            "Same",
                            "Change"
                           ),
         RteChange2 = factor(ifelse(is.na(RteChange),
                                    "Change",
                                    RteChange
                                   )
                            ),
         DirChange = ifelse(RouteAlt == RouteAlt_Lag1,
                            "Same",
                            "Change"
                           ),
         DirChange2 = factor(ifelse(is.na(DirChange),
                                    "Change",
                                    DirChange
                                   )
                            )
        )

# rm(AllDays_BusDayRoute)
rm(Stats_StSt_HrGrp)
str(AllDays_DirChange)

View(filter(AllDays_DirChange,
            between(RowNum_OG, 2570060, 2570080)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

```


Re-ordering the variables to ease with comprehension.
```{r}

AllDays_NewOrder <-  select(AllDays_DirChange,
                            RowNum_OG,
                            UniqueLatLng,
                            group,
                            StartStop_ID,
                            BusDay_EventNum,
                            Bus_ID,
                            Route,
                            RteChange2,
                            RouteAlt,
                            # RouteAlt_Lag1,
                            DirChange2,
                            Route_Direction,
                            Stop_Sequence,
                            Start_ID,
                            Start_Desc,
                            # Stop_ID,
                            StopID_Clean,
                            StopID_Indicator,
                            Stop_Desc,
                            countryCode,
                            Stop_State,
                            Stop_County,
                            Stop_City,
                            Stop_Zip,
                            Event_Type,
                            Event_Description,
                            Event_Time_Yr,
                            Event_Time_Mth,
                            Event_Time_Date,
                            Event_Time_Day,
                            Event_Time_Hr,
                            Event_Time_HrGroup,
                            Event_Time_Min,
                            Event_Time,
                            Departure_Time,
                            Dwell_Time,
                            Dwell_Time2,
                            Delta_Time,
                            Latitude,
                            Longitude,
                            Heading,
                            Odometer_Distance,
                            Odometer_Distance_Lag1,
                            Odometer_Distance_Mi,
                            TravelDistance_Ft,
                            TravelDistance_Mi,
                            TravelDistance_Mi_Hvrs,
                            TD_Mi_q2,
                            TD_Mi_q98,
                            TD_Mi_SS_q5,
                            TD_Mi_SS_q95,
                            TD_Mi_SSHG_q5,
                            TD_Mi_SSHG_q95,
                            TD_Mi_Mean,
                            TD_Mi_Mean_F,
                            TD_Mi_SS_Mean,
                            TD_Mi_SS_Mean_F,
                            TD_Mi_SSHG_Mean,
                            TD_Mi_SSHG_Mean_F,
                            TD_Mi_Med,
                            TD_Mi_Med_F,
                            TD_Mi_SS_Med,
                            TD_Mi_SS_Med_F,
                            TD_Mi_SSHG_Med,
                            TD_Mi_SSHG_Med_F,
                            TD_Mi_Cnt,
                            TD_Mi_Cnt_F,
                            TD_Mi_SS_Cnt,
                            TD_Mi_SS_Cnt_F,
                            TD_Mi_SSHG_Cnt,
                            TD_Mi_SSHG_Cnt_F,
                            TravelTime_Sec,
                            TT_Sec_q2,
                            TT_Sec_q98,
                            TT_Sec_SS_q5,
                            TT_Sec_SS_q95,
                            TT_Sec_SSHG_q5,
                            TT_Sec_SSHG_q95,
                            TT_Sec_Mean,
                            TT_Sec_Mean_F,
                            TT_Sec_SS_Mean,
                            TT_Sec_SS_Mean_F,
                            TT_Sec_SSHG_Mean,
                            TT_Sec_SSHG_Mean_F,
                            TT_Sec_Med,
                            TT_Sec_Med_F,
                            TT_Sec_SS_Med,
                            TT_Sec_SS_Med_F,
                            TT_Sec_SSHG_Med,
                            TT_Sec_SSHG_Med_F,
                            TT_Sec_Cnt,
                            TT_Sec_Cnt_F,
                            TT_Sec_SS_Cnt,
                            TT_Sec_SS_Cnt_F,
                            TT_Sec_SSHG_Cnt,
                            TT_Sec_SSHG_Cnt_F,
                            TravelTime_Hr,
                            TT_Hr_q2,
                            TT_Hr_q98,
                            TT_Hr_SS_q5,
                            TT_Hr_SS_q95,
                            TT_Hr_SSHG_q5,
                            TT_Hr_SSHG_q95,
                            TT_Hr_Mean,
                            TT_Hr_Mean_F,
                            TT_Hr_SS_Mean,
                            TT_Hr_SS_Mean_F,
                            TT_Hr_SSHG_Mean,
                            TT_Hr_SSHG_Mean_F,
                            TT_Hr_Med,
                            TT_Hr_Med_F,
                            TT_Hr_SS_Med,
                            TT_Hr_SS_Med_F,
                            TT_Hr_SSHG_Med,
                            TT_Hr_SSHG_Med_F,
                            TT_Hr_Cnt,
                            TT_Hr_Cnt_F,
                            TT_Hr_SS_Cnt,
                            TT_Hr_SS_Cnt_F,
                            TT_Hr_SSHG_Cnt,
                            TT_Hr_SSHG_Cnt_F,
                            SpeedAvg_Mph
                           )

rm(AllDays_DirChange)
str(select(AllDays_NewOrder,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(AllDays_NewOrder)

# View(head(AllDays_NewOrder, 500))
# View(tail(AllDays_NewOrder, 500))

```


Summarizing the data to help spot anomolies.
```{r}

View(group_by(AllDays_NewOrder,
              Stop_City) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = 100*Cnt_Num / (nrow(AllDays_NewOrder)
                                         )
                ) %>% 
       arrange(desc(Cnt_Num))
)

summary(AllDays_NewOrder)

```


Investigation of TravelDistance_Mi.

View(TravDistMi_Pctiles): 99% of TravelDistance_Mi are about 1 mile or less...but some weird TravelDistance_Mi values (e.g., 584 miles traveled) exist.
```{r}

TravDistMi_Ntile <- as.data.frame(AllDays_NewOrder$TravelDistance_Mi) %>% 
  mutate(#Pctile = ntile(AllDays_NewOrder$TravelDistance_Mi, 100),
         #MinR = min_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR = percent_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR_Round = round(PctR, 2)
        ) 

colnames(TravDistMi_Ntile)[1] <- "TravelDistance_Mi"
# str(TravDistMi_Ntile)

TravDistMi_Ntile_Rows <- nrow(TravDistMi_Ntile)

# View(tail(TravDistMi_Ntile, 500))


TravDistMi_Pctiles <- group_by(TravDistMi_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravDistMiAtPctile = min(TravelDistance_Mi),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravDistMi_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

rm(TravDistMi_Ntile)
rm(TravDistMi_Ntile_Rows)

View(TravDistMi_Pctiles)
TravDistMi_Pctiles

```


Investigation of TravelDistance_Mi.

Why are some TravelDistance_Mi "NA"? It looks like partially because the records are the first trip of the day (for that bus), so I purposefully set the distance to "NA". Another reason is due to the odometer recording a value less than the previous odometer recording. In most cases, I have no explanation for this - though I have observed about 67% of all instances where TravelDistance_Mi is NA (other than because it's the first record of the day) are instances where DirChange2 is "Change". This is weird and should be asked to WMATA.
```{r}

# View(head(AllDays_NewOrder, 500))

View(filter(AllDays_NewOrder,
            BusDay_EventNum != 1 # When BusDay_EventNum == 1, TravelDistance_Mi is NA by design (don't want to calculate distance based on yesterday's position)
           ) %>% 
       group_by(StartStop_ID) %>% 
       summarise(Cnts = sum(is.na(TravelDistance_Mi)
                           )
                ) %>% 
       arrange(desc(Cnts)
              )
    )

View(filter(AllDays_NewOrder,
            StartStop_ID == "1000245--1000211"
           ) %>% 
       select(RowNum_OG,
              StartStop_ID,
              Event_Time,
              Event_Time_HrGroup,
              Bus_ID,
              TravelDistance_Mi,
              TravelDistance_Mi_Hvrs,
              TD_Mi_SS_Mean,
              TD_Mi_SS_Mean_F,
              TD_Mi_SSHG_Mean,
              TD_Mi_SSHG_Mean_F,
              TD_Mi_SS_Med,
              TD_Mi_SS_Med_F,
              TD_Mi_SSHG_Med,
              TD_Mi_SSHG_Med_F,
              TD_Mi_SS_Cnt,
              TD_Mi_SS_Cnt_F,
              TD_Mi_SSHG_Cnt,
              TD_Mi_SSHG_Cnt_F
              ) %>% 
       mutate(Ratio_MeanToHvrs = TD_Mi_SS_Mean / TravelDistance_Mi_Hvrs) %>% 
       arrange(Event_Time)
    )

View(filter(AllDays_NewOrder,
            is.na(TravelDistance_Mi)
           )
    )

# These records are NA becuase the record is the first record of the day (the Event_Time_Date)
View(filter(AllDays_NewOrder,
            between(RowNum_OG, 326, 346) | # 336
              between(RowNum_OG, 591, 611) | # 601
              between(RowNum_OG, 845, 865) # 855
           )
    )

```


Investigation of TravelDistance_Mi.

These records are NA becuase the current record odometer is less than the previous record odometer. Theoretically, this should NOT happen. Me: it appears that about 67% of all instances where TravelDistance_Mi is NA (other than because it's th first record of the day) are instances where DirChange2 is "Change". This is weird and should be asked to WMATA.
```{r}

View(filter(AllDays_NewOrder,
            between(RowNum_OG, 194, 214) | # 204
              between(RowNum_OG, 440, 460) | # 450
              between(RowNum_OG, 478, 498) | # 488
              between(RowNum_OG, 510, 530) # 520
           )
    )

TestTable <- filter(AllDays_NewOrder,
                    BusDay_EventNum != 1
                   ) %>% 
  mutate(TravelDistance_NA = as.factor(ifelse(is.na(TravelDistance_Mi),
                                              "True",
                                              "False"
                                             )
                                      )
        ) %>%
  group_by(DirChange2, TravelDistance_NA) %>%
  summarise(TravDistMi_NACnts = n()
           )

# TestTable

TestTable_Spread <- as.data.frame(spread(TestTable,
                                         TravelDistance_NA,
                                         TravDistMi_NACnts
                                        )
                                 ) %>% 
  select(False,
         True
        )

row.names(TestTable_Spread) <- c("Change", "Same")
# str(TestTable_Spread)
# TestTable_Spread

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           2
          )

```


Investigation of TravelDistance_Mi.

Let's look at just the TravelDistance_Mi values that are NOT "NA".
```{r}

rm(TestTable, TestTable_Spread)

TravelDistance_Mi_NoNA <- filter(AllDays_NewOrder,
                                 # TravelDistance_Mi != 0 &
                                 !is.na(TravelDistance_Mi)
                                )

dim(AllDays_NewOrder)
dim(TravelDistance_Mi_NoNA)
nrow(AllDays_NewOrder) - nrow(TravelDistance_Mi_NoNA)

str(TravelDistance_Mi_NoNA)
summary(TravelDistance_Mi_NoNA)

```


Investigation of TravelDistance_Mi.

Let's plot just the TravelDistance_Mi values that are NOT "NA".
```{r}

TravDistMi_HistDen <- ggplot(select(TravelDistance_Mi_NoNA,
                                    TravelDistance_Mi
                                   ),
                             aes(x = TravelDistance_Mi,
                                 y = ..density..
                                )
                            ) +
  geom_histogram(binwidth = 0.05, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 4.0)
                 ) +
  labs(title = "Variation in Distance Between Stops",
       x = "Travel Distance (miles)",
       y = "Density"
      )

TravDistMi_HistDen

```

Investigation of TravelDistance_Mi.

Looking at the extremely large TravelDistance_Mi values. Some (aprox 27%) of TravelDistance_Mi values > 1 mile are when the DirChange2 changes...but what about the other ~73%?
```{r}

rm(TravelDistance_Mi_NoNA)

# examples of weirdly large TravelDistance_Mi
View(filter(AllDays_NewOrder,
            TravelDistance_Mi > 1.1587121212 # 1.1587121212 is the 99th percentile
           ) %>% 
       arrange(desc(TravelDistance_Mi)
              )
    )


# Why are these extremes?  Airports?  Bus collection points?
View(filter(AllDays_NewOrder,
              between(RowNum_OG, 494044, 494064) | # 494054
              between(RowNum_OG, 494273, 494293) | # 494283
              between(RowNum_OG, 494626, 494646) | # 494636
              between(RowNum_OG, 1610156, 1610176) | # 1610166
              between(RowNum_OG, 2073074, 2073094) # 2073084
           )
    )

# Before Removing Runs
# View(filter(AllDays_Sorted,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

# After Removing Runs
# View(filter(AllDays_FirstStopID,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

```


Investigation of TravelDistance_Mi.

Any relation with DirChange2?  Doesn't look as if this is so.
```{r}

ExtremeTravDist <- filter(AllDays_NewOrder,
                          !is.na(TravelDistance_Mi)
                         ) %>% 
  mutate(TravDist_Extreme = ifelse(TravelDistance_Mi > 1.1587121212, # 1.1587121212 is the 99th percentile
                                   "True",
                                   "False"
                                  )
                          ) %>% 
  group_by(DirChange2, TravDist_Extreme) %>% 
  summarise(TravDistMI_ExtCnts = n()
           )

# ExtremeTravDist


ExtremeTravDist_Spread <- as.data.frame(spread(ExtremeTravDist,
                                               TravDist_Extreme,
                                               TravDistMI_ExtCnts
                                              )
                                       ) %>% 
  select(False,
         True
        )

row.names(ExtremeTravDist_Spread) <- c("Change", "Same")
# str(ExtremeTravDist_Spread)
# ExtremeTravDist_Spread

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           2
          )

```


Investigation of TravelDistance_Mi.

Looking at specific buses and StartStop_ID.
```{r}

rm(ExtremeTravDist, ExtremeTravDist_Spread)

View(arrange(group_by(AllDays_NewOrder,
                      Bus_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )


# example of extremely small TravelDistance_Mi values (looks like the odometer wasn't functioning)
View(filter(AllDays_NewOrder,
            Bus_ID == 6111 |
              Bus_ID == 7201 |
              Bus_ID == 8058
           ) %>% 
       arrange(Bus_ID, Event_Time)
    )


View(arrange(group_by(AllDays_NewOrder,
                      StartStop_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )

# example of extremely large TravelDistance_Mi values...no idea why...
View(filter(AllDays_NewOrder,
            StartStop_ID == "1003665--12" |
              StartStop_ID == "1003665--5001925" |
              StartStop_ID == "3001038--3002565"
           ) %>% 
       arrange(StartStop_ID, Event_Time)
    )

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_New.

If TravelDisntace_Mi is below the 5th percentile for that StartStop_ID, or if TravelDisntace_Mi is above the 95th percentile for that StartStop_ID, or if TravelDistance_Mi is NA (when the BusDay_EventNum !=1), consider this an outlier.  In this case, replace the value with the mean for that StartStop_ID and HourGroup (TD_Mi_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
```{r}

# View(tail(AllDays_NewOrder, 500))

AllDays_NewTravelDist <- 
  mutate(AllDays_NewOrder,
         TravelDistance_Mi_New = ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F >= 20,
                                        TD_Mi_SSHG_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt_F >= 20,
                                        TD_Mi_SS_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SS_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt >= 20,
                                        TD_Mi_SS_Mean,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs != 0,
                                        TravelDistance_Mi_Hvrs,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs == 0,
                                        TD_Mi_SS_Mean,
                                        TravelDistance_Mi
                                       ))))),
         TravelDistance_Mi_New_Label = 
           factor(ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F >= 20,
                         "TD_Mi_SSHG_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F < 20 &
                           TD_Mi_SS_Cnt_F >= 20,
                         "TD_Mi_SS_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SS_Cnt_F < 20 &
                           TD_Mi_SS_Cnt >= 20,
                         "TD_Mi_SS_Mean",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs != 0,
                         "TravelDistance_Mi_Hvrs",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs == 0,
                         "TD_Mi_SS_Mean",
                         "TravelDistance_Mi"
                        )))))
                 ),
         TravelDistance_Mi_NewHvrs = ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                                              TravelDistance_Mi_Hvrs != 0 &
                                              (TravelDistance_Mi_New < TD_Mi_q2 |
                                                 TravelDistance_Mi_New > TD_Mi_q98
                                              ),
                                            TravelDistance_Mi_Hvrs,
                                            TravelDistance_Mi_New
                                           ),
         TravelDistance_Mi_NewHvrs_Label =
           factor(ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                           TravelDistance_Mi_Hvrs != 0 &
                           (TravelDistance_Mi_New < TD_Mi_q2 |
                              TravelDistance_Mi_New > TD_Mi_q98
                           ),
                         "TravelDistance_Mi_Hvrs",
                         as.character(TravelDistance_Mi_New_Label)
                        )
                 ),
         SpeedAvg_Mph_NewHvrs = TravelDistance_Mi_NewHvrs / TravelTime_Hr
        )

rm(AllDays_NewOrder)
str(AllDays_NewTravelDist)

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Quick summary and then correlation calculation.
```{r}

# 38 rows meet this criteria anymore  --  appears to be the case when both the Lat Long calculations, and the TravelDistance calculations did not function properly.
View(filter(AllDays_NewTravelDist,
            is.na(TravelDistance_Mi_New) &
              BusDay_EventNum != 1
           )
    )

View(AllDays_NewTravelDist %>% 
       arrange(desc(TravelDistance_Mi_New)) %>% 
       head(500)
    )

summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


cor(select(AllDays_NewTravelDist,
           TravelDistance_Mi,
           TravelDistance_Mi_Hvrs,
           TravelDistance_Mi_New,
           TravelDistance_Mi_NewHvrs
          ),
    use = "pairwise.complete.obs"
  )

```


Investigation of TravelDistance_Mi_NewHvrs_Label & TravelDistance_Mi_NewHvrs_Label.

Show how the labels changed.
```{r}

group_by(AllDays_NewTravelDist,
         TravelDistance_Mi_New_Label,
         TravelDistance_Mi_NewHvrs_Label
        ) %>% 
  summarise(CntNum = n(),
            CntPct = format(CntNum / nrow(AllDays_NewTravelDist),
                            scientific = 9999
                           )
           ) %>% 
  arrange(desc(CntPct)
         )

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing the two methods of calculating TravelDistance_Mi.

First, let's get create a function to plot the liner model equation.
```{r}

lm_eqn <- function(df, y, x){
  m <- lm(y ~ x, df)
  
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            s1 = ifelse(test = coef(m)[2] > 0,
                        yes = "+",
                        no = "-"
                       ),
            r2 = format(summary(m)$r.squared,
                        digits = 3
                       )
           )
  
  eq <- substitute(italic(y) == a~~s1~~b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   l
                  )
  
  as.character(as.expression(eq)
              )             
}

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_NewHvrs.

Scatter plot (using a 10% sample to making plotting time faster and to reduce un-needed data in the "same" splot).
```{r}

set.seed(123456789)
AllDays_NewTravelDist_10Pct <- filter(AllDays_NewTravelDist,
                                      !is.na(TravelDistance_Mi_NewHvrs) &
                                        !is.na(TravelDistance_Mi)
                                     ) %>% 
  rename(DistMethod = TravelDistance_Mi_NewHvrs_Label) %>% 
  sample_frac(0.1)


TravDist_MiVsCalc <- ggplot(select(AllDays_NewTravelDist_10Pct,
                                   TravelDistance_Mi_NewHvrs,
                                   TravelDistance_Mi,
                                   DistMethod
                                  ),
                            aes(x = TravelDistance_Mi,
                                y = TravelDistance_Mi_NewHvrs,
                                colour = DistMethod
                               )
                           ) +
  scale_colour_manual(values = c("red","blue", "green", "orange", "black")
                     ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 1.5)
                 ) +
  scale_x_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  scale_y_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  theme(legend.position = "bottom", #c(0.85, 0.40),
        legend.text = element_text(size = 6)
       ) +
  annotate(label = lm_eqn(df = AllDays_NewTravelDist_10Pct,
                          x = AllDays_NewTravelDist_10Pct$TravelDistance_Mi,
                          y = AllDays_NewTravelDist_10Pct$TravelDistance_Mi_NewHvrs
                         ),
           # x = 62,
           # y = 20,
           x = 0.70,
           y = 0.00,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  annotate(label = "Reference Line (slope = 1)",
           # x = 16,
           # y = 30,
           x = 0.80,
           y = 1.05,
           geom = "text",
           size = 3,
           colour = "red"
          ) +
  labs(title = "TravelDistance_Mi vs. TravelDistance_Mi_NewHvrs",
       x = "TravelDistance_Mi",
       y = "TravelDistance_Mi_NewHvrs"
      )
# +
#   geom_jitter()

TravDist_MiVsCalc

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing test with rbokeh.
```{r}

TravDist_MiVsCalc_Bokeh <- figure(data = select(AllDays_NewTravelDist_10Pct,
                                                TravelDistance_Mi_NewHvrs,
                                                TravelDistance_Mi,
                                                DistMethod
                                               ),
                                  xlim = c(0, 1.5),
                                  ylim = c(0, 1.5),
                                  legend_location = "bottom_right"
                                 ) %>% 
  ly_points(x = TravelDistance_Mi,
            y = TravelDistance_Mi_NewHvrs,
            color = DistMethod,
            hover = c(TravelDistance_Mi_NewHvrs, TravelDistance_Mi, DistMethod)
           ) %>% 
  ly_abline(a = 0, b = 1, color = "red")

TravDist_MiVsCalc_Bokeh

```


Investigation of TravelDistance_Mi_New.

Calculating the minimum TravelDistance_Mi_New value at each percentile.
```{r}

rm(TravDist_MiVsCalc_Bokeh)
rm(AllDays_NewTravelDist_10Pct)


summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


TravDistMiN_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          TravelDistance_Mi_New_Label,
                                          # TravelDistance_Mi_NewHvrs_Label,
                                          TravelDistance_Mi_New
                                          # TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         # PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         PctR_Round_N = round(PctR_N, 2)
         # PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiN_Ntile)
# View(head(TravDistMiN_Ntile, 500))

TravDistMiN_Ntile_Rows <- nrow(TravDistMiN_Ntile)

# View(tail(TravDistMiN_Ntile, 500))


TravDistMiN_Pctiles <- group_by(TravDistMiN_Ntile,
                                PctR_Round_N
                               ) %>% 
  summarise(
    MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    # MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    # CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    PctsAtPctile_N = CntsAtPctile_N / TravDistMiN_Ntile_Rows
    # PctsAtPctile_H = CntsAtPctile_H / TravDistMiN_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP_N = cumsum(PctsAtPctile_N)
         # CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiN_Pctiles)
TravDistMiN_Pctiles

```


Investigation of TravelDistance_Mi_NewHvrs

Calculating the minimum TravelDistance_Mi_NewHvrs value at each percentile.
```{r}

TravDistMiH_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          # TravelDistance_Mi_New_Label,
                                          TravelDistance_Mi_NewHvrs_Label,
                                          # TravelDistance_Mi_New,
                                          TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(# PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         # PctR_Round_N = round(PctR_N, 2),
         PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiH_Ntile)
# View(head(TravDistMiH_Ntile, 500))

TravDistMiH_Ntile_Rows <- nrow(TravDistMiH_Ntile)

# View(tail(TravDistMiH_Ntile, 500))


TravDistMiH_Pctiles <- group_by(TravDistMiH_Ntile,
                                PctR_Round_H
                               ) %>% 
  summarise(
    # MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    # CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    # PctsAtPctile_N = CntsAtPctile_N / TravDistMiH_Ntile_Rows,
    PctsAtPctile_H = CntsAtPctile_H / TravDistMiH_Ntile_Rows
  ) %>% 
  mutate(# CumSumPAtP_N = cumsum(PctsAtPctile_N),
         CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiH_Pctiles)
TravDistMiH_Pctiles

```


Join TravDistMiH_Pctiles, TravDistMiN_Pctiles, and TravDistMi_Pctiles.

~11% of rides are still showing as less than 0.1 miles of TravelDistance_Mi_NewHvrs.
```{r}

rm(TravDistMiN_Ntile_Rows, TravDistMiH_Ntile_Rows, TravDistMiN_Ntile, TravDistMiH_Ntile)


# View(TravDistMi_Pctiles)
# View(TravDistMiN_Pctiles)
# View(TravDistMiH_Pctiles)

TravDistMi_Pctiles_All <- inner_join(x = TravDistMi_Pctiles,
                                     y = TravDistMiN_Pctiles,
                                     by = c("PctR_Round" = "PctR_Round_N")
                                    ) %>% 
  inner_join(y = TravDistMiH_Pctiles,
             by = c("PctR_Round" = "PctR_Round_H")
            ) %>% 
  select(PctR_Round,
         MinTravDistMiAtPctile,
         MinTDMiAtPctile_N,
         MinTDMiAtPctile_H,
         CntsAtPctile,
         CntsAtPctile_N,
         CntsAtPctile_H,
         PctsAtPctile,
         PctsAtPctile_N,
         PctsAtPctile_H,
         CumSumPAtP,
         CumSumPAtP_N,
         CumSumPAtP_H
         )

# str(TravDistMi_Pctiles_All)

rm(TravDistMi_Pctiles, TravDistMiN_Pctiles,TravDistMiH_Pctiles)


View(TravDistMi_Pctiles_All)
TravDistMi_Pctiles_All

```


Investigation of TravelDistance_Mi_New.

Why are there still some small or large TravelDistance_Mi_NewHvrs values.
```{r}

# View(filter(AllDays_NewTravelDist,
#             !is.na(TravelDistance_Mi_NewHvrs)
#            ) %>% 
#        select(-TD_Mi_q2,
#               -TD_Mi_q98,
#               -TD_Mi_SS_q5,
#               -TD_Mi_SS_q95,
#               -TD_Mi_SSHG_q5,
#               -TD_Mi_SSHG_q95,
#               -TD_Mi_Mean,
#               -TD_Mi_Mean_F,
#               -TD_Mi_SS_Mean,
#               -TD_Mi_SS_Mean_F,
#               -TD_Mi_SSHG_Mean,
#               -TD_Mi_SSHG_Mean_F,
#               -TD_Mi_Med,
#               -TD_Mi_Med_F,
#               -TD_Mi_SS_Med,
#               -TD_Mi_SS_Med_F,
#               -TD_Mi_SSHG_Med,
#               -TD_Mi_SSHG_Med_F,
#               -TD_Mi_Cnt,
#               -TD_Mi_Cnt_F,
#               -TD_Mi_SS_Cnt,
#               -TD_Mi_SS_Cnt_F,
#               -TD_Mi_SSHG_Cnt,
#               -TD_Mi_SSHG_Cnt_F,
#               -TT_Sec_q2,
#               -TT_Sec_q98,
#               -TT_Sec_SS_q5,
#               -TT_Sec_SS_q95,
#               -TT_Sec_SSHG_q5,
#               -TT_Sec_SSHG_q95,
#               -TT_Sec_Mean,
#               -TT_Sec_Mean_F,
#               -TT_Sec_SS_Mean,
#               -TT_Sec_SS_Mean_F,
#               -TT_Sec_SSHG_Mean,
#               -TT_Sec_SSHG_Mean_F,
#               -TT_Sec_Med,
#               -TT_Sec_Med_F,
#               -TT_Sec_SS_Med,
#               -TT_Sec_SS_Med_F,
#               -TT_Sec_SSHG_Med,
#               -TT_Sec_SSHG_Med_F,
#               -TT_Sec_Cnt,
#               -TT_Sec_Cnt_F,
#               -TT_Sec_SS_Cnt,
#               -TT_Sec_SS_Cnt_F,
#               -TT_Sec_SSHG_Cnt,
#               -TT_Sec_SSHG_Cnt_F,
#               -TT_Hr_q2,
#               -TT_Hr_q98,
#               -TT_Hr_SS_q5,
#               -TT_Hr_SS_q95,
#               -TT_Hr_SSHG_q5,
#               -TT_Hr_SSHG_q95,
#               -TT_Hr_Mean,
#               -TT_Hr_Mean_F,
#               -TT_Hr_SS_Mean,
#               -TT_Hr_SS_Mean_F,
#               -TT_Hr_SSHG_Mean,
#               -TT_Hr_SSHG_Mean_F,
#               -TT_Hr_Med,
#               -TT_Hr_Med_F,
#               -TT_Hr_SS_Med,
#               -TT_Hr_SS_Med_F,
#               -TT_Hr_SSHG_Med,
#               -TT_Hr_SSHG_Med_F,
#               -TT_Hr_Cnt,
#               -TT_Hr_Cnt_F,
#               -TT_Hr_SS_Cnt,
#               -TT_Hr_SS_Cnt_F,
#               -TT_Hr_SSHG_Cnt,
#               -TT_Hr_SSHG_Cnt_F
#              ) %>% 
#        arrange(TravelDistance_Mi_NewHvrs) %>% 
#        head(500)
#     )

View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(TravelDistance_Mi_NewHvrs) %>%
       head(500)
    )

# examples of the smallest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1424440 & RowNum_OG <= 1424460) | # 1424450  --  direction change
                (RowNum_OG >= 763292 & RowNum_OG <= 763312) | # 763302  --  direction change
                (RowNum_OG >= 1679093 & RowNum_OG <= 1679113) | # 1679103  --  direction change
                (RowNum_OG >= 2860918 & RowNum_OG <= 2860938) # 2860928  --  looks correct
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )


View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(desc(TravelDistance_Mi_NewHvrs)
              ) %>%
       head(500)
    )

# examples of the largest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1092000 & RowNum_OG <= 1092050) | # 1092030  --  direction change
                (RowNum_OG >= 1609460 & RowNum_OG <= 1609480) | # 1609470  -- direction change 
                (RowNum_OG >= 508904 & RowNum_OG <= 508924) | # 508914  --  direction change & original StopID was bad
                (RowNum_OG >= 2476345 & RowNum_OG <= 2476365) # 2476355  --  direction change
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

```


Investigation of TravelTime_Hr.

View(TravDistMi_Pctiles): 98% of TravelTime_Hr are between 7 seconds and 464 seconds (~8 minutes).
```{r}

TravTimeHr_Ntile <- select(AllDays_NewTravelDist,
                           TravelTime_Hr
                          ) %>% 
  mutate(# Pctile = ntile(AllDays_NewTravelDist$TravelTime_Hr, 100),
         # MinR = min_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR = percent_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        ) 

# str(TravTimeHr_Ntile)

TravTimeHr_Ntile_Rows <- nrow(TravTimeHr_Ntile)

# View(tail(TravTimeHr_Ntile, 500))


TravTimeHr_Pctiles <- group_by(TravTimeHr_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTimeHr_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * 3600
        )

rm(TravTimeHr_Ntile_Rows)
rm(TravTimeHr_Ntile)
View(TravTimeHr_Pctiles)
TravTimeHr_Pctiles

```


Investigation of TravelTime_Hr.

Histogram of TravelTime_Sec.
```{r}

TravTime_Sec_HistDen <- ggplot(filter(select(AllDays_NewTravelDist,
                                             TravelTime_Sec
                                            ),
                                      !is.na(TravelTime_Sec)
                                     ),
                               aes(x = TravelTime_Sec,
                                   y = ..density..
                                  )
                          ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # stat_bin(binwidth = 5,
  #          geom = "text",
  #          size = 2.5,
  #          vjust = 1.5,
  #          aes(label = format(..count.., big.mark = ",")
  #             ),
  #         ) +
  coord_cartesian(xlim = c(0, 180), ylim = c(0, 0.02)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Time",
       x = "Travel Time (sec)",
       y = "Density"
      )

TravTime_Sec_HistDen

```


Investigation of TravelTime_Sec.

TravelTime_Sec values are NA.
```{r}

summary(AllDays_NewTravelDist$TravelTime_Sec)


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(is.na(TravelTime_Sec) &
                BusDay_EventNum != 1  # TravelTime purposefully not calculated here
             )
    )

# examples of TravelTime_Sec values that are NA. These are NA because the Event_Time & Departure_Time readings are not accurate (i.e., the previous Departure_Time is BEFORE or EQUAL TO the current Event_Time).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 90809 & RowNum_OG <= 90829) | # 90819
                (RowNum_OG >= 90881 & RowNum_OG <= 90901) | # 90891
                (RowNum_OG >= 2597066 & RowNum_OG <= 2597086) | # 2597076
                (RowNum_OG >= 2613305 & RowNum_OG <= 2613325) # 2613315
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt"))
    )

```


Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely small.
```{r}

View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(TravelTime_Sec,
               desc(SpeedAvg_Mph_NewHvrs)
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )

```


Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely large.
```{r}

View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 1007703 & RowNum_OG <= 1007723) | # 1007713
                (RowNum_OG >= 2373564 & RowNum_OG <= 2373584) | # 2373574
                (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
                (RowNum_OG >= 2570060 & RowNum_OG <= 2570080) # 2570070
           )
    )

```


Investigation of TravelTime_Sec.

Are large TravelTime_Sec values related to RouteChanges? Looks likely. When the Bus involves a Route "change", there is almost twice as likely to be a case of an outlier TravelTime_Sec value (on the high side).
```{r}

TTLargeRteChng <- select(AllDays_NewTravelDist,
                         -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                        ) %>% 
  mutate(TT_Out = factor(ifelse(TravelTime_Sec > 464,  # this is the 99th percentile
                                "Outlier",
                                "Normal"
                               )
                        )
        )

# str(TTLargeRteChng)


TTLargeRteChng_Cnts <- group_by(TTLargeRteChng,
                                RteChange2,
                                TT_Out
                               ) %>% 
  summarise(Cnts = n()
           )

TTLargeRteChng_Spread <- as.data.frame(spread(TTLargeRteChng_Cnts,
                                              TT_Out,
                                              Cnts
                                             )
                                      ) %>%
  select(-RteChange2)

row.names(TTLargeRteChng_Spread) <- c("Change", "Same")
# str(TTLargeRteChng_Spread)


# When the Bus involves a Route "change", there is almost twice as likely to be a case of an outlier TravelTime_Sec value.
TTLargeRteChng_Spread
prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           2
          )

# rm(TTLargeRteChng, TTLargeRteChng_Spread)
         
```


Investigation of TravelTime_Sec.

Are large TravelTime_Sec values related to RouteChanges? Looks likely.
```{r}

View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )


```


Investigation of TravelTime_Sec.

If TravelTime_Sec is below the 5th percentile for that StartStop_ID, or if TravelTime_Sec is above the 95th percentile for that StartStop_ID,  consider this an outlier.  In this case, replace the value with the mean for that StartStop_ID and HourGroup (TT_Sec_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
```{r}

rm(TTLargeRteChng, TTLargeRteChng_Cnts, TTLargeRteChng_Spread)


NewTravTime <- mutate(AllDays_NewTravelDist,
                      TT_Sec_New = ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F >= 20,
                                          TT_Sec_SSHG_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt_F >= 20,
                                          TT_Sec_SS_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt >= 20,
                                          TT_Sec_SS_Mean,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt < 20 &
                                            RteChange2 == "Change",
                                          NA,
                                          TravelTime_Sec
                                         )))),
                      
                      TT_Sec_New_Label = 
           factor(ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F >= 20,
                         "TT_Sec_SSHG_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F < 20 &
                           TT_Sec_SS_Cnt_F >= 20,
                         "TT_Sec_SS_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                            ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt >= 20,
                         "TT_Sec_SS_Mean",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt < 20 &
                           RteChange2 == "Change",
                         NA,
                         "TravelTime_Sec"
                        ))))
                 ),
                  
                  TT_Hr_New = TT_Sec_New / (60 * 60)
           )


dim(AllDays_NewTravelDist)
dim(NewTravTime)
rm(AllDays_NewTravelDist)

summary(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )

str(select(NewTravTime,
           TravelTime_Sec,
           TT_Sec_New,
           TT_Sec_New_Label,
           TT_Hr_New
          )
   )


summary(select(NewTravTime,
               TravelTime_Sec,
               TT_Sec_New,
               TT_Sec_New_Label,
               TT_Hr_New
              )
       )

```


Test investigation of just the X2 Route. Box plots for time between bus arrivals (by HourGroup).
```{r}

View(head(select(NewTravTime,
                 -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                )
         )
    )

X2 <- select(NewTravTime,
             -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            ) %>% 
  filter(Route == "X2")

str(X2)

View(head(arrange(X2,
                  Bus_ID,
                  Event_Time
                 ),
          500
         )
    )

X2_ByStop <- group_by(X2,
                      StopID_Clean
                     ) %>% 
  arrange(StopID_Clean,
          Event_Time) %>% 
  mutate(Event_Time_L1 = lag(Event_Time),
         TimeToEvent_Sec = as.numeric(Event_Time - Event_Time_L1),
         TimeToEvent_Min = TimeToEvent_Sec / 60
        )

View(head(X2_ByStop, 500))


# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(as.data.frame(X2_ByStop),
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                     )

TimeBtwEvents_X2_BoxPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                          TimeToEvent_Min,
                                          Event_Time_HrGroup
                                         ),
                                   aes(factor(Event_Time_HrGroup),
                                       TimeToEvent_Min,
                                       fill = factor(Event_Time_HrGroup)
                                      )
                                  ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 120)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot

```


Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Hour Group).
```{r}

TimeBtwEvents_X2_ViolinPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                             TimeToEvent_Min,
                                             Event_Time_HrGroup
                                             ),
                                      aes(factor(Event_Time_HrGroup),
                                          TimeToEvent_Min,
                                          fill = factor(Event_Time_HrGroup)
                                         )
                                     ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 80)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot

```


Test investigation of just the X2 Route. Box plots for time between bus arrivals (by Zip Code).
```{r}

# Count_Values is needed to display the medians on the box plots
Count_Values_z <- ddply(as.data.frame(X2_ByStop),
                        .(Stop_Zip),
                        summarise,
                        Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                       )

TimeBtwEvents_X2_BoxPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                            TimeToEvent_Min,
                                            Stop_Zip
                                           ),
                                     aes(factor(Stop_Zip),
                                         TimeToEvent_Min,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 100)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot_z

```


Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).
```{r}

TimeBtwEvents_X2_ViolinPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                               TimeToEvent_Min,
                                               Stop_Zip
                                               ),
                                        aes(factor(Stop_Zip),
                                            TimeToEvent_Min,
                                            fill = factor(Stop_Zip)
                                           )
                                       ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 60)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

First, get the max and min times of bus stops (each day, and for each route).
```{r}

rm(X2, X2_ByStop, X2_Long, X2_Pct)


RouteMinMax <- group_by(NewTravTime,
                        Route,
                        Event_Time_Date
                       ) %>% 
  summarise(MinTime = min(Event_Time),
            MaxTime = max(Event_Time)
           )

str(RouteMinMax)
View(RouteMinMax)

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

(Pulls here are done by day, as the data are too large to do at once.)
```{r}

# View(head(NewTravTime, 500))

# For each record, create a random datetime between the first and last stop for that bus route (on that day).
for(i in 3:7){

set.seed(123456789)
Samp <- select(NewTravTime,
               RowNum_OG,
               Route,
               # RouteGroup,
               Event_Time_Date,
               StopID_Clean,
               starts_with("Event")
              ) %>% 
  filter(Event_Time_Date == i) %>%  # needed to do this each day (3-7) because the complete file was too large to do at once
  left_join(RouteMinMax,
            by = c("Route" = "Route",
                   "Event_Time_Date" = "Event_Time_Date"
                  )
           ) %>% 
  mutate(SampTime = as_datetime(runif(nrow(.), #200000,
                                      min = MinTime,
                                      max = MaxTime
                                     ),
                                tz = "America/New_York"
                               )
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) 

# str(Samp)
# View(head(Samp, 500))
# 
# View(
# group_by(Samp,
#          RowNum_OG
#         ) %>%
#   summarise(Cnt_Num = n(),
#             Cnt_Pct = 100 * Cnt_Num / nrow(Samp)
#            ) %>%
#   arrange(desc(Cnt_Num))
# )


# For each Route and StopID combination, get all the Event_Time values that are after the SampTime value.
# estimating approx 2hrs of runtime for all 2.8m records
Testing_A <- sqldf("   Select               t1.*
                                            ,t2.Event_Time             as NextBus
                        From                 Samp                      as t1
                             Inner Join      Samp                      as t2
                                On              t1.Route = t2.Route
                                And             t1.StopID_Clean = t2.StopID_Clean
                                And             t2.Event_Time > t1.SampTime
                        Order By             t1.Route
                                            ,t1.StopID_Clean
                                            ,t1.Event_Time
                                            ,t2.Event_Time
                  "
                 ) %>% 
  mutate(NB = as_datetime(NextBus,
                          tz = "America/New_York"
                         )
        )

# str(Testing_A)
# View(head(Testing_A, 500))
# View(head(Samp, 500))


# Filter the dataframe to only include the bus arrival at StopID that is the next to come after the SampTime.
# estimating approx 20min of runtime for all 2.8m records
Testing <- select(Testing_A,
                  -NextBus
                 ) %>% 
  group_by(RowNum_OG) %>% 
  filter(NB == min(NB)
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) %>% 
  mutate(WaitTime_Min = as.numeric(NB - SampTime),
         WaitTime_Sec = WaitTime_Min * 60,
         WaitTime_Sec2 = NB - SampTime,
         WaitTime_Min2 = WaitTime_Sec2 / 60
        ) %>% 
  as.data.frame()

assign(paste0("Testing_", i),
       Testing
      )

rm(Samp,Testing_A, Testing)
str(get(paste0("Testing_", i)))
View(get(paste0("Testing_", i)))
}


# Bind all the individual dataframes together.
WaitData_DayPull <- bind_rows(Testing_3,
                              Testing_4,
                              Testing_5,
                              Testing_6,
                              Testing_7
                             ) %>% 
  mutate(WaitTime_Sec3 = NB - SampTime,
         WaitTime_Min3 = WaitTime_Sec3 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

rm(Testing_3, Testing_4, Testing_5, Testing_6, Testing_7)
str(WaitData_DayPull)
View(head(WaitData_DayPull, 500))
View(tail(WaitData_DayPull, 500))

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

Basic investigation of any missing rows from data pulled by day.
```{r}

DistinctRowNum_OG <- distinct(select(WaitData_DayPull,
                                     RowNum_OG
                                    )
                             )

str(DistinctRowNum_OG)

# View(
# anti_join(Samp,
#           DistinctRowNum_OG,
#           by = c("RowNum_OG" = "RowNum_OG")
#          )
# )


# The samp time is AFTER the last bus passed that StopID_Clean
# View(filter(Samp,
#             Event_Time > "2016-10-07 19:48:41" &
#               Route == "X2" &
#               StopID_Clean == 1003774
#            )
#     )

# Next Bus (NB) can be on the next morning
# View(filter(Testing7,
#             SampTime > "2016-10-06 23:58:00" &
#               SampTime < "2016-10-06 23:59:59")
#     )

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)

First, we need to find the most common bus routes.
```{r}

rm(DistinctRowNum_OG)


# View(head(NewTravTime, 500))

set.seed(123456789)
BusGroups <- group_by(NewTravTime,
                      Route
                     ) %>% 
  summarise(Cnt_Num = n(),
            Cnt_Pct = Cnt_Num / nrow(NewTravTime)
           ) %>% 
  arrange(desc(Cnt_Num)
         ) %>% 
  mutate(RowNum = row_number(),
         RandNum = runif(n = 268),
         RouteGroup = ifelse(RandNum <= 0.2,
                             1,
                      ifelse(RandNum <= 0.4,
                             2,
                      ifelse(RandNum <= 0.6,
                             3,
                      ifelse(RandNum <= 0.8,
                             4,
                             5
                            ))))
        )

str(BusGroups)
View(BusGroups)
summary(BusGroups)

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)
```{r}

# View(head(NewTravTime, 500))

# For each record, create a random datetime between the first and last stop for that bus route (on that day).
for(i in 1:5){
  
set.seed(123456789)
Samp <- left_join(NewTravTime,
                  BusGroups,
                  by = c("Route" = "Route")
                  ) %>% 
  select(RowNum_OG,
         Route,
         RouteGroup,
         Event_Time_Date,
         StopID_Clean,
         starts_with("Event")
        ) %>% 
  filter(RouteGroup == i) %>%  # needed to do this each RouteGroup (1-5) because the complete file was too large to do at once
  left_join(RouteMinMax,
            by = c("Route" = "Route",
                   "Event_Time_Date" = "Event_Time_Date"
                  )
           ) %>% 
  mutate(SampTime = as_datetime(runif(nrow(.), #200000,
                                      min = MinTime,
                                      max = MaxTime
                                     ),
                                tz = "America/New_York"
                               )
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) 

# str(Samp)
# View(head(Samp, 500))
# 
# View(
# group_by(Samp,
#          RowNum_OG
#         ) %>%
#   summarise(Cnt_Num = n(),
#             Cnt_Pct = 100 * Cnt_Num / nrow(Samp)
#            ) %>%
#   arrange(desc(Cnt_Num))
# )


# For each Route and StopID combination, get all the Event_Time values that are after the SampTime value.
# estimating approx 2hrs of runtime for all 2.8m records
Testing_A <- sqldf("   Select               t1.*
                                            ,t2.Event_Time             as NextBus
                        From                 Samp                      as t1
                             Inner Join      Samp                      as t2
                                On              t1.Route = t2.Route
                                And             t1.StopID_Clean = t2.StopID_Clean
                                And             t2.Event_Time > t1.SampTime
                        Order By             t1.Route
                                            ,t1.StopID_Clean
                                            ,t1.Event_Time
                                            ,t2.Event_Time
                  "
                 ) %>% 
  mutate(NB = as_datetime(NextBus,
                          tz = "America/New_York"
                         )
        )

# str(Testing_A)
# View(head(Testing_A, 500))
# View(head(Samp, 500))


# Filter the dataframe to only include the bus arrival at StopID that is the next to come after the SampTime.
# estimating approx 20min of runtime for all 2.8m records
Testing <- select(Testing_A,
                  -NextBus
                 ) %>% 
  group_by(RowNum_OG) %>% 
  filter(NB == min(NB)
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) %>% 
  mutate(WaitTime_Min = as.numeric(NB - SampTime),
         WaitTime_Sec = WaitTime_Min * 60
        ) %>% 
  as.data.frame()

assign(paste0("Testing", i),
       Testing
      )

rm(Samp,Testing_A, Testing)
str(get(paste0("Testing", i)))
View(get(paste0("Testing", i)))
}


# Bind all the individual dataframes together.
WaitData_RoutePull <- bind_rows(Testing1,
                                Testing2,
                                Testing3,
                                Testing4,
                                Testing5
                             ) %>% 
  mutate(WaitTime_Sec2 = NB - SampTime,
         WaitTime_Min2 = WaitTime_Sec2 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

rm(BusGroups, i, Testing3, Testing4, Testing5, Testing6, Testing7)
str(WaitData_RoutePull)
View(head(WaitData_RoutePull, 500))
View(tail(WaitData_RoutePull, 500))

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

Compare WaitData pulled by day and pulled by route.
```{r}

dim(WaitData_RoutePull)
dim(WaitData_DayPull)
nrow(WaitData_RoutePull) - nrow(WaitData_DayPull)

WaitData_Diff <- anti_join(WaitData_RoutePull,
                           WaitData_DayPull,
                           by = c("RowNum_OG" = "RowNum_OG"
                                 )
                          ) %>% 
  select(-WaitTime_Min,
         -WaitTime_Sec
        )

str(WaitData_Diff)
View(head(WaitData_Diff, 500))

View(filter(WaitData_RoutePull,
            Route == "Z8" &
              StopID_Clean == 2005465
            # RowNum_OG = 2902760
            # Event_Time = 2016-10-07 19:51:47
           )
    )

View(group_by(WaitData_Diff,
              Route
             ) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = Cnt_Num / nrow(WaitData_Diff)
                ) %>% 
       arrange(desc(Cnt_Num)
              )
    )

View(filter(WaitData_Diff,
            Route == "S1"
           )
    )

View(filter(WaitData_RoutePull,
            Route == "S1" &
              StopID_Clean == 1003132
            # RowNum_OG = 1151770
            # Event_Time = 2016-10-07 09:07:12
           )
    )

# Can't tell why the pull by day has less records than the pull by route

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

Compare WaitData (pulled by route) and original data (NewTravTime).
```{r}

dim(NewTravTime)  # 2,809,529 rows
dim(WaitData_RoutePull)  # 2,780,848 rows
nrow(NewTravTime) - nrow(WaitData_RoutePull)  # is 28,681 rows

str(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(WaitData_RoutePull)

Compare_NTT_WD <- left_join(NewTravTime,
                            select(WaitData_RoutePull,
                                   RowNum_OG,
                                   # Route,
                                   RouteGroup,
                                   # StopID_Clean,
                                   # Event_Time,
                                   MinTime,
                                   MaxTime,
                                   SampTime,
                                   NB,
                                   WaitTime_Sec2,
                                   WaitTime_Min2
                                  ),
                            by = c("RowNum_OG" = "RowNum_OG")
                           ) %>% 
  select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

str(Compare_NTT_WD)  # 2,810,109 rows overall  --  29,261 rows with no match
View(head(Compare_NTT_WD, 500))
View(filter(Compare_NTT_WD,
            is.na(MinTime)
           )
    )



# View(anti_join(Samp,
#                distinct(select(WaitData_RoutePull,
#                                RowNum_OG
#                               )
#                        ),
#                by = c("RowNum_OG" = "RowNum_OG")
#               )
#     )

# The SampTime is AFTER the last bus passed that StopID_Clean
# View(filter(Samp,
#               Route == "X2" &
#               StopID_Clean == 1003774
#             # RowNum_OG = 1146723
#             # Event_Time = 2016-10-07 15:32:18
#            )
#     )

```


Clean up the data a bit.
```{r}

rm(BusGroups, RouteMinMax, Samp, Testing1, Testing2, Testing3, Testing4, Testing5, Testing_3, Testing_4, Testing_5, Testing_6, Testing_7, WaitData_DayPull, WaitData_Diff)


str(Compare_NTT_WD)
View(head(Compare_NTT_WD, 500))
View(head(mutate(Compare_NTT_WD,
                 WT_Min = as.numeric(WaitTime_Min2)
                )
         )
    )

WaitTime_AsNum <- Compare_NTT_WD %>% 
  mutate(RouteStop_ID = factor(paste(Route, StopID_Clean, sep = "__")
                              )
        )
WaitTime_AsNum$WaitTime_Sec2 <- as.numeric(WaitTime_AsNum$WaitTime_Sec2)
WaitTime_AsNum$WaitTime_Min2 <- as.numeric(WaitTime_AsNum$WaitTime_Min2)

rm(Compare_NTT_WD)
str(WaitTime_AsNum)

```


General exploration of wait times.
```{r}

summary(WaitTime_AsNum$WaitTime_Min2)

```


General exploration of wait times.
```{r}

WT_Quantiles <- as.data.frame(quantile(WaitTime_AsNum$WaitTime_Min2,
                                       probs = seq(0, 1, 0.01),
                                       na.rm = TRUE
                                      )
                             )

colnames(WT_Quantiles) <- "Value_Min"

WT_Quantiles$Value_Sec = format(round(WT_Quantiles$Value_Min * 60,
                                      digits = 2
                                     ),
                                nsmall = 2
                               )
WT_Quantiles$Value_Hr = format(round(WT_Quantiles$Value_Min / 60,
                                     digits = 2
                                    ),
                                nsmall = 2
                               )
WT_Quantiles$Value_Min = format(round(WT_Quantiles$Value_Min,
                                      digits = 2
                                     ),
                                nsmall = 2
                               )

WT_Quantiles$Quantile <- seq(0, 1, 0.01)

WT_Quantiles <- select(WT_Quantiles,
                       Quantile,
                       Value_Sec,
                       Value_Min,
                       Value_Hr
                      )

str(WT_Quantiles)
View(WT_Quantiles)
WT_Quantiles


View(arrange(WaitTime_AsNum,
             desc(WaitTime_Min2)
            ) %>% 
       head(., 5000)
    )

View(filter(WaitTime_AsNum,
            between(WaitTime_Min2, 60, 200)
           ) %>% 
       arrange(desc(WaitTime_Min2)
              ) 
     # %>% 
     #   head(., 5000)
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "W13" &  # only 2 bus passes in the entire dataset
              StopID_Clean == 1003728
            # Event_Time = 2016-10-03 08:42:46
           )
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "S41" &  # only 4 bus passes in the entire dataset
              StopID_Clean == 1001095
            # Event_Time = 2016-10-05 15:41:47
           )
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "D8" &  # route has VERY limited service after midnight
              StopID_Clean == 1001669
            # Event_Time = 2016-10-06 20:31:16
           )
    )

```


Looks like there might be an issue in wait times when very few Route-Stop combinations are included in the dataset.  Let's explore these.
```{r}

RouteStop_Cnts <- group_by(WaitTime_AsNum,
                           RouteStop_ID
                          ) %>% 
  summarise(RouteStop_CntNum = n(),
            RouteStop_CntPct = RouteStop_CntNum / nrow(WaitTime_AsNum)
           ) %>% 
  arrange(RouteStop_CntNum)

View(RouteStop_Cnts)


RouteStop_CntOfCnt <- group_by(RouteStop_Cnts,
                               RouteStop_CntNum
                              ) %>% 
  summarise(RouteStopCnt_CntNum = n(),
            RouteStopCnt_CntPct = RouteStopCnt_CntNum / nrow(RouteStop_Cnts)
           ) %>% 
  mutate(RouteStopCnt_CntPct_CumSum = cumsum(RouteStopCnt_CntPct),
         x = 1 - RouteStopCnt_CntPct_CumSum
        ) %>% 
  arrange(RouteStop_CntNum)
  
 View(RouteStop_CntOfCnt)
 RouteStop_CntOfCnt

```


Histogram of the counts of Route-StopID combinations.
```{r}

RouteStop_Cnts_Bar <- ggplot(RouteStop_CntOfCnt,
                             aes(x = RouteStop_CntNum,
                                 # y = ..density..
                                 y = RouteStopCnt_CntNum
                                )
                            ) +
  # geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  coord_cartesian(xlim = c(0, 500)
                  # ylim = c(0, 0.02)
                 ) +
  labs(title = "Variation in Routes Passing a Specific Stop",
       x = "Occurrences of Route-StopID Combiantions",
       y = "Counts"
      )

RouteStop_Cnts_Bar

```


Create a new dataset limiting extremely small counts of Route-StopID combinations.
```{r}

WaitTime_RteCnts <- left_join(WaitTime_AsNum,
                              RouteStop_Cnts,
                              by = c("RouteStop_ID" = "RouteStop_ID")
                             ) %>% 
  select(-RouteStop_CntPct)

dim(WaitTime_AsNum)
dim(WaitTime_RteCnts)

rm(WaitTime_AsNum)
str(WaitTime_RteCnts)


# Total rows
nrow(WaitTime_RteCnts)

# Rows of rare RouteStops
nrow(filter(WaitTime_RteCnts,
            RouteStop_CntNum <= 60
           )
    ) / nrow(WaitTime_RteCnts)

# Rows of extremely long wait times
nrow(filter(WaitTime_RteCnts,
            WaitTime_Min2 > 180
           )
    ) / nrow(WaitTime_RteCnts)


select(WaitTime_RteCnts,
       WaitTime_Min2
      ) %>% 
  summary()

filter(WaitTime_RteCnts,
       RouteStop_CntNum > 60  # 12 passes per day in a 5-day dataset
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()

filter(WaitTime_RteCnts,
       WaitTime_Min2 < 180  # probably means that something went wrong
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()

```


Compare quantiles in the limited datasets.
```{r}

a <- as.data.frame(select(WaitTime_RteCnts,
                          WaitTime_Min2
                         ) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

b <- as.data.frame(filter(WaitTime_RteCnts,
                          RouteStop_CntNum > 60
                         ) %>% 
                     select(WaitTime_Min2) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

c <- as.data.frame(filter(WaitTime_RteCnts,
                          WaitTime_Min2 < 180
                         ) %>% 
                     select(WaitTime_Min2) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

WT_Filter_Quantiles <- bind_cols(a, b, c) %>% 
  mutate(Quantile = seq(0, 1, 0.01)
        )

colnames(WT_Filter_Quantiles) <- c("All", "RteStpAbv60", "WTBlw180", "Quantile")
rm(a, b, c)
View(WT_Filter_Quantiles)
WT_Filter_Quantiles

```


Histogram of all wait times.
```{r}

WaitTime_AllBus_HistDen <- ggplot(filter(select(WaitTime_RteCnts,
                                                WaitTime_Min2
                                               ),
                                         !is.na(WaitTime_Min2)
                                        ),
                                  aes(x = WaitTime_Min2,
                                      y = ..density..
                                     )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  scale_x_continuous(breaks = seq(0, 300, 30)
                    ) +
  coord_cartesian(xlim = c(0, 300),
                  ylim = c(0, 0.035)
                 ) +
  labs(title = "Variation in Wait Time",
       x = "Wait Time (min)",
       y = "Density"
      )

WaitTime_AllBus_HistDen

```


Box plots for WaitTime (all busses, by Zip Code).
```{r}

# Count_Values is needed to display the medians on the box plots
BusRoute <- select(WaitTime_RteCnts,
                   Route,
                   WaitTime_Min2,
                   Stop_Zip
                  ) %>% 
  filter(Route == "X2")

CountValues_AllBus_Zip <- ddply(BusRoute,
                                .(Stop_Zip),
                                summarise,
                                Value_Counts = median(WaitTime_Min2, na.rm = TRUE)
                               )

WaitTime_AllBus_Zip_Box <- ggplot(BusRoute,
                                  aes(factor(Stop_Zip),
                                      WaitTime_Min2,
                                      fill = factor(Stop_Zip)
                                     )
                                 ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )

WaitTime_AllBus_Zip_Box

```


Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).
```{r}

WaitTime_AllBus_Zip_Violin <- ggplot(BusRoute,
                                     aes(factor(Stop_Zip),
                                         WaitTime_Min2,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

```


Box plots for WaitTime (Zip Code, by HourGroupZip).
```{r}

# Count_Values is needed to display the medians on the box plots
Zip <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Stop_Zip == 20002)

CountValues_AllBus_HG <- ddply(Zip,
                               .(Event_Time_HrGroup),
                               summarise,
                               Value_Counts = median(WaitTime_Min2,
                                                     na.rm = TRUE
                                                    )
                               )

WaitTime_AllBus_HG_Box <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )

WaitTime_AllBus_HG_Box

```


Violin plots for WaitTime (Zip Code, by HourGroupZip).
```{r}

WaitTime_AllBus_HG_Vln <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 90)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )

WaitTime_AllBus_HG_Vln

```


Box plots for WaitTime (Route, by HourGroupZip).
```{r}

# Count_Values is needed to display the medians on the box plots
Rte <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Route == "X2")

CountValues_AllBus_RteHG <- group_by(Rte,
                                     Event_Time_HrGroup
                                    ) %>% 
  summarise(
    Value_Counts = median(WaitTime_Min2,
                          na.rm = TRUE
                         ),
    VC = quantile(WaitTime_Min2, probs = 0.9, na.rm = TRUE)
    )


WaitTime_AllBus_RteHG_Box <- ggplot(Rte,
                                    aes(factor(Event_Time_HrGroup),
                                        WaitTime_Min2,
                                        fill = factor(Event_Time_HrGroup)
                                       )
                                   ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_RteHG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, max(CountValues_AllBus_RteHG$VC))
                 ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("Route X2"),
       x = "Hour Group",
       y = "Waiting Time (min)"
      ) 
# +
#   facet_wrap(~Stop_Zip
#              # nrow = 5
#             )

WaitTime_AllBus_RteHG_Box

```


Violin plots for WaitTime (Zip Code, by HourGroupZip).
```{r}

WaitTime_AllBus_RteHG_Vln <- ggplot(Rte,
                                    aes(factor(Event_Time_HrGroup),
                                        WaitTime_Min2,
                                        fill = factor(Event_Time_HrGroup)
                                       )
                                   ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_RteHG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("(Route X2)"),
       x = "Hour Group",
       y = "Waiting Time (min)"
      ) +
  facet_wrap(~Stop_Zip
             # nrow = 5
            )

WaitTime_AllBus_RteHG_Vln

```


X2 Percentiles Line Graph Test.
```{r}

X2_Pct <- select(WaitTime_RteCnts,
                 Route,
                 Stop_Zip,
                 Event_Time_Date,
                 Event_Time_Day,
                 Event_Time_HrGroup,
                 Event_Time_Hr,
                 Latitude,
                 Longitude,
                 WaitTime_Min2
                ) %>% 
  filter(Route == "X2") %>% 
  group_by(Event_Time_Hr,
           Stop_Zip
          ) %>% 
  summarise(Pct50 = quantile(WaitTime_Min2, probs = 0.5, na.rm = TRUE),
            Pct60 = quantile(WaitTime_Min2, probs = 0.6, na.rm = TRUE),
            Pct70 = quantile(WaitTime_Min2, probs = 0.7, na.rm = TRUE),
            Pct80 = quantile(WaitTime_Min2, probs = 0.8, na.rm = TRUE),
            Pct90 = quantile(WaitTime_Min2, probs = 0.9, na.rm = TRUE)
           )

str(X2_Pct)
View(X2_Pct)


X2_Long <- gather(X2_Pct,
                  key = Percentile,
                  value = Pctile,
                  Pct50,
                  Pct60,
                  Pct70,
                  Pct80,
                  Pct90
                )

str(X2_Long)
View(X2_Long)


X2_WaitByHr_Line <- ggplot(X2_Long,
                           aes(x = Event_Time_Hr,
                               y = Pctile,
                               factor(Percentile),
                               color = Percentile
                              )
                          ) +
  geom_line() +
  theme(legend.title=element_blank(),
        legend.position = "bottom"
       ) +
  coord_cartesian(xlim = c(0, 23)
                  # ylim = c(0, 45)
                 ) + 
  scale_x_continuous(breaks = seq(0, 23, 2)
                    ) +
  labs(title = "Waiting Time Throughout the Day",
       subtitle = ("(Route X2)"),
       x = "Hour of the Day",
       y = "Waiting Time (min)"
      ) +
  facet_wrap(~Stop_Zip)

X2_WaitByHr_Line

```



GET DATA READY FOR SHINY  --  GET DATA READY FOR SHINY  --  GET DATA READY FOR SHINY
GET DATA READY FOR SHINY  --  GET DATA READY FOR SHINY  --  GET DATA READY FOR SHINY
GET DATA READY FOR SHINY  --  GET DATA READY FOR SHINY  --  GET DATA READY FOR SHINY

BaseData: Used in plots by hour and zipcode (first two Shiny tabs).
```{r}

# str(WaitTime_RteCnts)

Shiny_WaitData_Base <- select(WaitTime_RteCnts,
                              Route,
                              Stop_Zip,
                              Event_Time,
                              Event_Time_Date,
                              Event_Time_Day,
                              Event_Time_HrGroup,
                              Event_Time_Hr,
                              Latitude,
                              Longitude,
                              WaitTime_Min2
                             ) %>% 
  mutate(Event_Time_YrMthDayHr = floor_date(Event_Time, "hour")
        ) %>% 
  rename(ZipCode = Stop_Zip,
         HourGroup = Event_Time_HrGroup,
         Date = Event_Time_Date,
         Day = Event_Time_Day,
         Hour = Event_Time_Hr,
         WaitTime_Min = WaitTime_Min2
        ) %>% 
  filter(WaitTime_Min <= 180)

Shiny_WaitData_Base$Route <- factor(Shiny_WaitData_Base$Route)

str(Shiny_WaitData_Base)
View(tail(Shiny_WaitData_Base, 500))

saveRDS(Shiny_WaitData_Base,
        "Shiny_WaitData_Base.rds"
       )

```


Prep data for mapping.
```{r}

# devtools::install_github("dkahle/ggmap")
# devtools::install_github("hadley/ggplot2")
# install.packages("ggmap", type = "source")

# devtools::install_github('hadley/ggplot2')
devtools::install_github("hadley/ggplot2@v2.2.0")
# devtools::install_github('thomasp85/ggforce')
# devtools::install_github('thomasp85/ggraph')
# devtools::install_github('slowkow/ggrepel')


tract <- 
  readOGR(dsn = "/Users/mdturse/Desktop/Analytics/DCMetroBus/tl_2016_us_zcta510",
          layer = "tl_2016_us_zcta510"
         )
  
class(tract)

# convert the GEOID to a character
tract@data$GEOID <- as.character(tract@data$GEOID)
str(tract@data)


ggtract <- tidy(tract, region = "GEOID")

# str(ggtract)
# summary(ggtract)
# View(head(ggtract, 50))



# str(Shiny_WaitData_Base)

ZipWaitTest <- filter(Shiny_WaitData_Base,
                      WaitTime_Min <= 180 &
                        !is.na(ZipCode)
                     ) %>% 
  group_by(ZipCode,
           Event_Time_YrMthDayHr
           # Event_Time_Day,
           # Event_Time_Hr
          ) %>% 
  summarise(Pct80 = quantile(WaitTime_Min, probs = 0.8, na.rm = TRUE)
           ) %>% 
  arrange(# Event_Time_Hr,
          ZipCode,
          Event_Time_YrMthDayHr
         ) %>% 
  as.data.frame() %>% 
  mutate(Event_Time_DateNew = floor_date(Event_Time_YrMthDayHr, "day"),
         Event_Time_HrNew = hour(Event_Time_YrMthDayHr),
         Pct80_Level = factor(ifelse(Pct80 < 10,
                                     "Below 10",
                              ifelse(Pct80 < 20,
                                     "Below 20",
                              ifelse(Pct80 < 30,
                                     "Below 30",
                              ifelse(Pct80 < 40,
                                     "Below 40",
                              ifelse(Pct80 < 50,
                                     "Below 50",
                              ifelse(Pct80 < 60,
                                     "Below 60",
                                     "60 Plus"
                                    )))))),
                              levels = c("Below 10", "Below 20", "Below 30", 
                                         "Below 40", "Below 50", "Below 60", "60 Plus"
                                        ),
                              ordered = TRUE
                             )
        )

str(ZipWaitTest)
ZipWaitTest$ZipCode <- as.character(ZipWaitTest$ZipCode)
str(ZipWaitTest)
summary(ZipWaitTest)

View(head(ZipWaitTest, 500))


StopZip_Left <- left_join(ZipWaitTest,
                          ggtract,
                          by = c("ZipCode" = "id")
                         )

str(StopZip_Left)
summary(StopZip_Left)

```


Test mapping functionaltiy.
```{r}

map <- get_map(location = c(lon = -77.03676, lat = 38.89784),
               source = "google",
               # maptype = "roadmap"
               zoom = 12
              )

ggmap(map) +
  geom_polygon(aes(x = long, 
                   y = lat, 
                   group = group,
                   fill = Pct80_Level
                  ), 
               data = filter(StopZip_Left,
                             Event_Time_YrMthDayHr == as.POSIXct("2016-10-07 20:00:00")
                             # &
                             #   Stop_Zip == "20003"
                            ),
               colour = "gray1", 
               # fill = 'black', 
               alpha = .4, 
               size = .3
              ) +
# +
  # scale_fill_gradientn(colours = c("white", "royalblue4", "red"),
  #                      #  "lightsteelblue4",
  #                      # "lightpink1",
  #                      # values=cbPalette,
  #                      # values = c(1,0.5, .3, .2, .1, 0)
  #                      na.value = "black",
  #                      breaks = c(seq(0, 180, 30))
  #                      # values = rescale()
  #                     ) 
# +
  scale_fill_brewer(palette = "Spectral", # "YlOrRd" # "Set1",
                    direction = -1,
                    limits = levels(StopZip_Left$Pct80_Level)
                   )

```


Shiny data for mapping (used in 3rd tab).
```{r}

View(head(filter(StopZip_Left,
                 Event_Time_HrNew == 15
                ),
          500
         )
    )

Shiny_WaitData_Map <- StopZip_Left %>% 
  rename(YrMthDayHr = Event_Time_YrMthDayHr,
         YrMthDay = Event_Time_DateNew,
         Hour = Event_Time_HrNew
        )

str(Shiny_WaitData_Map)


Shiny_WaitData_Map_Wed <- filter(Shiny_WaitData_Map,
                                 YrMthDay == as.POSIXct("2016-10-05")
                                )

str(Shiny_WaitData_Map_Wed)
summary(Shiny_WaitData_Map_Wed)


saveRDS(Shiny_WaitData_Map,
        "Shiny_WaitData_Map.rds"
       )

saveRDS(Shiny_WaitData_Map_Wed,
        "Shiny_WaitData_Map_Wed.rds"
       )

```




Clustering

Data prep.
```{r}

rm(tract, ggtract, StopZip_Left, ZipWaitTest, Shiny_WaitData_Base, Shiny_WaitData_Map, Shiny_WaitData_Map_Wed)


dim(NewTravTime)
dim(WaitTime_RteCnts)


str(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(select(NewTravTime,
           matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(WaitTime_RteCnts)

 
 
# ClustData <- select(WaitTime_RteCnts,
#                     group,
#                     BusDay_EventNum,
#                     Route,
#                     RteChange2,
#                     RouteAlt,
#                     DirChange2,
#                     Route_Direction,
#                     Stop_Sequence,
#                     StopID_Indicator,
#                     Stop_County,
#                     Stop_City,
#                     Stop_Zip,
#                     Event_Time_Hr,
#                     Dwell_Time2,
#                     TravelDistance_Mi_NewHvrs,
#                     TravelDistance_Mi_NewHvrs_Label,
#                     TT_Sec_New,
#                     TT_Sec_New_Label,
#                     WaitTime_Min2
#                    ) %>% 
#   filter(WaitTime_Min2 <= 180) %>% 
#   mutate(SpeedAvg_Mph_TDMNH_TTSN = TravelDistance_Mi_NewHvrs / 
#            (TT_Sec_New / 60 / 60)
#         )
# %>% 
#   select_if(ClustData,
#             function(col) is.numeric(col) |
#               is.integer(col)
#            ) %>% 
#    scale()

# str(ClustData)
# View(tail(ClustData, 500))
# rownames(ClustData) <- ClustData$Route
# ClustData$Route <- as.factor(ClustData$Route)
# str(ClustData)
# head(ClustData)


RouteStats <- filter(WaitTime_RteCnts,
                     WaitTime_Min2 <= 180
                    ) %>% 
  mutate(SpeedAvg_Mph_TDMNH_TTSN = TravelDistance_Mi_NewHvrs / (TT_Sec_New / 60 / 60)
        ) %>% 
  group_by(Route) %>% 
  summarise(BusDayEventNum_Mean = mean(BusDay_EventNum, na.rm = TRUE),
            BusDayEventNum_Pct10 = quantile(BusDay_EventNum, probs = 0.10, na.rm = TRUE),
            BusDayEventNum_Pct25 = quantile(BusDay_EventNum, probs = 0.25, na.rm = TRUE),
            BusDayEventNum_Pct50 = quantile(BusDay_EventNum, probs = 0.50, na.rm = TRUE),
            BusDayEventNum_Pct75 = quantile(BusDay_EventNum, probs = 0.75, na.rm = TRUE),
            BusDayEventNum_Pct90 = quantile(BusDay_EventNum, probs = 0.90, na.rm = TRUE),
            StopSequence_Mean = mean(Stop_Sequence, na.rm = TRUE),
            StopSequence_Pct10 = quantile(Stop_Sequence, probs = 0.10, na.rm = TRUE),
            StopSequence_Pct25 = quantile(Stop_Sequence, probs = 0.25, na.rm = TRUE),
            StopSequence_Pct50 = quantile(Stop_Sequence, probs = 0.50, na.rm = TRUE),
            StopSequence_Pct75 = quantile(Stop_Sequence, probs = 0.75, na.rm = TRUE),
            StopSequence_Pct90 = quantile(Stop_Sequence, probs = 0.90, na.rm = TRUE),
            EventTimeHr_Mean = mean(Event_Time_Hr, na.rm = TRUE),
            EventTimeHr_Pct10 = quantile(Event_Time_Hr, probs = 0.10, na.rm = TRUE),
            EventTimeHr_Pct25 = quantile(Event_Time_Hr, probs = 0.25, na.rm = TRUE),
            EventTimeHr_Pct50 = quantile(Event_Time_Hr, probs = 0.50, na.rm = TRUE),
            EventTimeHr_Pct75 = quantile(Event_Time_Hr, probs = 0.75, na.rm = TRUE),
            EventTimeHr_Pct90 = quantile(Event_Time_Hr, probs = 0.90, na.rm = TRUE),
            DwellTime2_Mean = mean(Dwell_Time2, na.rm = TRUE),
            DwellTime2_Pct10 = quantile(Dwell_Time2, probs = 0.10, na.rm = TRUE),
            DwellTime2_Pct25 = quantile(Dwell_Time2, probs = 0.25, na.rm = TRUE),
            DwellTime2_Pct50 = quantile(Dwell_Time2, probs = 0.50, na.rm = TRUE),
            DwellTime2_Pct75 = quantile(Dwell_Time2, probs = 0.75, na.rm = TRUE),
            DwellTime2_Pct90 = quantile(Dwell_Time2, probs = 0.90, na.rm = TRUE),
            TravDistMi_Mean = mean(TravelDistance_Mi_NewHvrs, na.rm = TRUE),
            TravDistMi_Pct10 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.10, na.rm = TRUE
                                       ),
            TravDistMi_Pct25 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.25, na.rm = TRUE
                                       ),
            TravDistMi_Pct50 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.50, na.rm = TRUE
                                       ),
            TravDistMi_Pct75 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.75, na.rm = TRUE
                                       ),
            TravDistMi_Pct90 = quantile(TravelDistance_Mi_NewHvrs,
                                        probs = 0.90, na.rm = TRUE
                                       ),
            TravTimSec_Mean = mean(TT_Sec_New, na.rm = TRUE),
            TravTimSec_Pct10 = quantile(TT_Sec_New, probs = 0.10, na.rm = TRUE),
            TravTimSec_Pct25 = quantile(TT_Sec_New, probs = 0.25, na.rm = TRUE),
            TravTimSec_Pct50 = quantile(TT_Sec_New, probs = 0.50, na.rm = TRUE),
            TravTimSec_Pct75 = quantile(TT_Sec_New, probs = 0.75, na.rm = TRUE),
            TravTimSec_Pct90 = quantile(TT_Sec_New, probs = 0.90, na.rm = TRUE),
            WaitTimMin_Mean = mean(WaitTime_Min2, na.rm = TRUE),
            WaitTimMin_Pct10 = quantile(WaitTime_Min2, probs = 0.10, na.rm = TRUE),
            WaitTimMin_Pct25 = quantile(WaitTime_Min2, probs = 0.25, na.rm = TRUE),
            WaitTimMin_Pct50 = quantile(WaitTime_Min2, probs = 0.50, na.rm = TRUE),
            WaitTimMin_Pct75 = quantile(WaitTime_Min2, probs = 0.75, na.rm = TRUE),
            WaitTimMin_Pct90 = quantile(WaitTime_Min2, probs = 0.90, na.rm = TRUE)
           ) %>% 
  as.data.frame()

str(RouteStats)

rownames(RouteStats) <- RouteStats$Route
str(RouteStats)
View(RouteStats)


RouteStats_Scaled <- select(RouteStats,
                            -Route
                           ) %>% 
  scale()

str(RouteStats_Scaled)
class(RouteStats_Scaled)
View(RouteStats_Scaled)

summary(RouteStats)
summary(RouteStats_Scaled)

# <- select_if(ClustData,
#                               function(col) is.numeric(col) |
#                                 is.integer(col)
#                              ) %>% 
  # scale() %>% 
  # as.data.frame() %>% 
  # na.omit()

# str(ClustData_NoFact)
# summary(ClustData_NoFact)

```


PCA
```{r}

Trnsfrm <- preProcess(select(RouteStats,
                             -Route
                            ),
                      method = c("BoxCox", "center", "scale", "pca")
                     )

# loadings
Trnsfrm$rotation

RouteStats_Pca <- predict(Trnsfrm, RouteStats) %>% 
  select(-Route)
RouteStats_Pca

```


Clustering.

Are the data clusterable?
```{r}

##### Are the data clusterable?
# gradient_col <- list(low = "steelblue", high = "white")
ClustData_Ends <- get_clust_tendency(RouteStats_Pca,
                                     n = nrow(RouteStats_Pca
                                             ) - 1,
                                     # gradient = gradient_col,
                                     seed = 123456789
                                    )

str(ClustData_Ends)

# Hopkins statistic
ClustData_Ends$hopkins_stat  # value of 0.1657494 implies that the data are not uniformly distributed (they are "clusterable")

#plot
ClustData_Ends$plot

```


Clustering. How many clusters are there?

kmeans, pam, and hierarchical clustring methods, using within sum of squares and silhouette measures.
```{r}

# class(RouteStats_Pca)

fviz_nbclust(RouteStats_Pca, kmeans, method = "wss")  # ~8 clusters
fviz_nbclust(RouteStats_Pca, pam, method = "wss")  # ~6 clusters
fviz_nbclust(RouteStats_Pca, hcut, method = "wss")  # ~6 clusters

fviz_nbclust(RouteStats_Pca, kmeans, method = "silhouette")  # 2 clusters
fviz_nbclust(RouteStats_Pca, pam, method = "silhouette")  # 2 clusters
fviz_nbclust(RouteStats_Pca, hcut, method = "silhouette",
             hc_method = "complete")  # 2 clusters

```


Clustering. How many clusters are there?

kmeans method with the gap statistic, using bootstrap.
```{r}

# Compute gap statistic
# kmeans version
set.seed(123456789)
# system.time(
gap_stat_km <- clusGap(RouteStats_Pca,
                       FUN = kmeans,
                       nstart = 25,
                       K.max = 10,
                       B = 500
                      )
# )

# Print
print(gap_stat_km, method = "Tibs2001SEmax")
print(gap_stat_km)


# pam version
set.seed(123456789)
gap_stat_pm <- clusGap(RouteStats_Pca,
                       FUN = pam,
                       K.max = 10,
                       B = 500
                      )

# Print
print(gap_stat_pm, method = "Tibs2001SEmax")
print(gap_stat_pm)


# hierarchical version
set.seed(123456789)
gap_stat_hcut <- clusGap(RouteStats_Pca,
                         FUN = hcut,
                         K.max = 10,
                         B = 500
                        )

# Print
print(gap_stat_hcut, method = "Tibs2001SEmax")
print(gap_stat_hcut)



# Plot kmeans
fviz_gap_stat(gap_stat_km, 
              maxSE = list(method = "Tibs2001SEmax")
             )  # 1 cluster

# Plot pam
fviz_gap_stat(gap_stat_pm, 
              maxSE = list(method = "Tibs2001SEmax")
             )  # 2 cluster

# Plot hierarchical
fviz_gap_stat(gap_stat_hcut, 
              maxSE = list(method = "Tibs2001SEmax")
             )  # 1 cluster

```


Clustering. How many clusters are there?

kmeans method with various different statistics.
```{r}

# str(iris)

nb <- NbClust(RouteStats_Pca, #scale(iris[ ,-5]),
              distance = "euclidean",
              min.nc = 2,
              max.nc = 15,
              method = "kmeans",
              index = "all"
             )

fviz_nbclust(nb) + theme_minimal()

```


Clustering. How many clusters are there?

Hierarchical clustering method. Particularly looking at silhouette statistics.
```{r}

# Hierarchical clustering, cut in 2 to 15 groups
for(i in 2:15) {
  assign(paste0("HCRes_K", i),
         eclust(RouteStats_Pca,
                "hclust",
                k = i,
                method = "complete",
                graph = FALSE
               )
        )
  
  assign("x",
         get(paste0("HCRes_K", i)
            )
        )
  
  assign(paste0("HCStats_K", i),
         cluster.stats(dist(RouteStats_Scaled,
                            method ="euclidean"
                           ),
                       x$cluster
                      )
        )
  
  assign("y",
         get(paste0("HCStats_K", i)
            )
        )
  
  assign(paste0("HCDend_K", i),
         fviz_dend(x, rect = TRUE, show_labels = FALSE)
        )
  
  assign(paste0("HCSil_K", i),
         fviz_silhouette(x)
        )
  
  assign(paste0("HCSilWidth_K", i),
         as.data.frame(y$clus.avg.silwidths) %>% 
           mutate(KVal = 1:nrow(.)
                 )
        )
  }


HCSilWidth_AllK <- left_join(select(HCSilWidth_K15,
                                    KVal,
                                    `y$clus.avg.silwidths`
                                   ),
                             HCSilWidth_K14,
                             by = c("KVal" = "KVal")
                            ) %>% 
  left_join(.,
            HCSilWidth_K13,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K12,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K11,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K10,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K9,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K8,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K7,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K6,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K5,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K4,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K3,
            by = c("KVal" = "KVal")
           ) %>% 
  left_join(.,
            HCSilWidth_K2,
            by = c("KVal" = "KVal")
           )
  
colnames(HCSilWidth_AllK) <- c("KVal", "K15", "K14", "K13", "K12", "K11", "K10", "K9",
                               "K8", "K7", "K6", "K5", "K4", "K3", "K2"
                              )


# Visualize
HCDend_K2
HCDend_K3
HCDend_K4
HCDend_K5
HCDend_K6
HCDend_K7
HCDend_K8
HCDend_K9
HCDend_K10
HCDend_K11
HCDend_K12
HCDend_K13
HCDend_K14
HCDend_K15

HCSil_K2
HCSil_K3
HCSil_K4
HCSil_K5
HCSil_K6
HCSil_K7
HCSil_K8
HCSil_K9
HCSil_K10
HCSil_K11
HCSil_K12
HCSil_K13
HCSil_K14
HCSil_K15


HCSilWidth_AllK

```


Using kmeans, PAM, and Hierarchical clustering methods, we can say we probably have aroun 2 clusters.

Let's try density clustering.  (This tends to show that maybe there is only one "cluster," meaning that data are not clusterable.)
```{r}

rm(list = ls(pattern = "_K")
  )


# Compute DBSCAN using fpc package
kNNdistplot(RouteStats_Pca, k = 10)
abline(h = 8.5, lty = 2)

set.seed(123456789)
db <- fpc::dbscan(RouteStats_Pca,
                  eps = 8.5,
                  MinPts = 10
                )

str(db)
db

# Plot DBSCAN results
fviz_cluster(db,
             RouteStats_Pca,
             stand = FALSE,
             frame = FALSE,
             geom = "point"
            )

```




Investigating TravelTime_Sec.
```{r}

View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(NewTravTime,
            # -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            -(TD_Mi_q2:TD_Mi_SSHG_Cnt_F),
            -(TT_Hr_q2:TT_Hr_SSHG_Cnt_F)
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )






# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )
```




```{r}

         
         
# examples where TravelTime_Sec is unusually small (with TravelDistance_Mi values that are large).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1042228 & RowNum_OG <= 1042248) | # 1042238
                (RowNum_OG >= 53816 & RowNum_OG <= 53836) | # 53826
                (RowNum_OG >= 360571 & RowNum_OG <= 360591) | # 360581
                (RowNum_OG >= 502271 & RowNum_OG <= 502291) # 502281 (can't explian the weird TravelTime_Sec calculation here - it's not even an integer!)
           )
    )

# still trying to explain 502281...on the day of this weirdness, the bus was only in circulation for 4-5 stops (~20 minutes) on that day (Oct 6)
View(filter(AllDays_NewTravelDist,
            Bus_ID == 2711
           )
    )


# exploring large values for TravelTime_Sec
View(filter(AllDays_NewTravelDist,
            TravelTime_Sec == 300
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph2
              )
    )

# examples where TravelTime_Sec is unusually large (with TravelDistance_Mi values that are small, so SpeedAvg_Mph values are very small).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 2627459 & RowNum_OG <= 2627479) | # 2627469
                (RowNum_OG >= 2193344 & RowNum_OG <= 2193364) | # 2193354
                (RowNum_OG >= 1644123 & RowNum_OG <= 1644143) | # 1644133
                (RowNum_OG >= 869600 & RowNum_OG <= 869620) # 869610
           )
    )

```

Investigation of SpeedAvg_Mph2

View(Speed_Pctiles): 90% of SpeedAvg_Mph2 are between ~3mph and ~66mph.
```{r}

Speed_Ntile <- as.data.frame(AllDays_NewTravelDist$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(AllDays_NewTravelDist$SpeedAvg_Mph2, 100),
         MinR = min_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR = percent_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_Ntile)

Speed_Ntile_Rows <- nrow(Speed_Ntile)

View(tail(Speed_Ntile, 500))


Speed_Pctiles <- group_by(Speed_Ntile,
                          PctR_Round
                         ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_Pctiles)

```

Investigation of SpeedAvg_Mph2.

Exploring the removal of outlier TravelTime_Sec and TravelDistance_Mi.
```{r}

summary(select(AllDays_NewTravelDist,
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      TravelDistance_Mi > 0.0001893939 & # lowest non-zero percentile
                        TravelDistance_Mi < 1.0812500000 & # 99th percentile
                        TravelTime_Sec > 10.050000 & # 2nd percentile
                        TravelTime_Sec < 293.000000 # 98th percentile
                     ),
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

```


Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2.
```{r}

Speed_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                               !is.na(SpeedAvg_Mph2)
                              ),
                        aes(x = SpeedAvg_Mph2,
                            y = ..density..
                           )
                       ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       x = "Average Speed (mph)",
       y = "Density"
      )

Speed_HistDen

```


Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2 after removing outlier TravelTime_Sec and TravelDistance_Mi.
```{r}

View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

SpeedNoOutlier_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                                        !is.na(SpeedAvg_Mph2) &
                                          TravelDistance_Mi_New > 0.077841005 & # 5th percentile
                                          # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                          TravelTime_Sec > 12.100000 # 4th percentile
                                          # TravelTime_Sec < 293.000000 # 98th percentile
                                       ),
                                 aes(x = SpeedAvg_Mph2,
                                     y = ..density..
                                    )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       subtitle = "(removed low outliers of Travel Distance and Travel Time)",
       x = "Average Speed (mph)",
       y = "Density"
      )

SpeedNoOutlier_HistDen

```


Investigation of SpeedAvg_Mph2.

New dataset (NoOutliers_TravelDistNTime) when removing outlier low values of TravelDistance_Mi_New and TravelTime_Sec.
```{r}

View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

NoOutliers_TravelDistNTime <- filter(AllDays_NewTravelDist,
                                     TravelDistance_Mi_New > .077841005 & # 5th percentile
                                       # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                       TravelTime_Sec > 12.100000 # 4th percentile
                                       # TravelTime_Sec < 293.000000 # 98th percentile
                                    )

nrow(AllDays_NewTravelDist) - nrow(NoOutliers_TravelDistNTime)

str(NoOutliers_TravelDistNTime)
summary(NoOutliers_TravelDistNTime)

```


Investigation of SppedAvg_Mph2.

View(Speed_NoOut_Pctiles):  Aproximately 90% of SpeedAvg_Mph2 values are between ~4mph and ~56mph.
```{r}

Speed_NoOut_Ntile <- as.data.frame(NoOutliers_TravelDistNTime$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(NoOutliers_TravelDistNTime$SpeedAvg_Mph2, 100),
         MinR = min_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR = percent_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_NoOut_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_NoOut_Ntile)

Speed_NoOut_Ntile_Rows <- nrow(Speed_NoOut_Ntile)

View(tail(Speed_NoOut_Ntile, 500))


Speed_NoOut_Pctiles <- group_by(Speed_NoOut_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_NoOut_Pctiles)

```


Investigation of SppedAvg_Mph2.

Exloring odd/impossible values.
```{r}

# Exploring when SpeedAvg_Mph2 is NA  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            is.na(SpeedAvg_Mph2)
           )
    )


# Exploring when SpeedAvg_Mph2 is zero  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            SpeedAvg_Mph2 == 0
           )
    )


# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            SpeedAvg_Mph2 > 0 &
              SpeedAvg_Mph2 < 3.2848770
           ) %>% 
       arrange(SpeedAvg_Mph2)
    )

# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 485338 & RowNum_OG <= 485358) | # 485348  --  Extreme travel time, Route Change
                (RowNum_OG >= 346952 & RowNum_OG <= 346972) | # 346962  -- Extreme travel time, Route Change 
                (RowNum_OG >= 70494 & RowNum_OG <= 70514) | # 70504  --  Extreme travel time, Route Change
                (RowNum_OG >= 2051846 & RowNum_OG <= 2051866) # 2051856  --  Extreme travel time, Route Change
           )
    )

```


Investigation of SpeedAvg_Mph2.

Limit the dataset based on SpeedAvg_Mph2.
```{r}

NoOutliersSpeed <- filter(NoOutliers_TravelDistNTime,
                          between(SpeedAvg_Mph2,
                                  4.069300, # 5th percentile
                                  56.05651 #95th percentile
                                 )
                          )

nrow(NoOutliers_TravelDistNTime) - nrow(NoOutliersSpeed)

summary(NoOutliersSpeed)

```


TravelTime now looks like it has some odd values on the high end.  So let's look at those.

View(TravTime_NoOut_Pctiles):  Virtually all trips should take less than 5 minutes. (The 99th percentile of of TravelTime is approximately 8 minutes.)
```{r}

TravTime_NoOut_Ntile <- as.data.frame(NoOutliersSpeed$TravelTime_Hr) %>% 
  mutate(Pctile = ntile(NoOutliersSpeed$TravelTime_Hr, 100),
         MinR = min_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR = percent_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        )

colnames(TravTime_NoOut_Ntile)[1] <- "TravelTime_Hr"
str(TravTime_NoOut_Ntile)

TravTime_NoOut_Ntile_Rows <- nrow(TravTime_NoOut_Ntile)

View(tail(TravTime_NoOut_Ntile, 500))


TravTime_NoOut_Pctiles <- group_by(TravTime_NoOut_Ntile,
                                   PctR_Round
                                  ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTime_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * (60 * 60)
        )

View(TravTime_NoOut_Pctiles)

```


Investigating odd TravelTime_Sec values.

Trips longer than ~8 minutes.
```{r}

View(filter(NoOutliersSpeed,
            TravelTime_Sec > 491 # min at the 100th percentile
           ) %>% 
       arrange(desc(TravelTime_Sec)
              )
    )

# examples of TravelTime_Sec values that are largest.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 2071759 & RowNum_OG <= 2071779) | # 2071769  --  results from a route change, and a 3hr+ wait before the new route starts
                (RowNum_OG >= 1473686 & RowNum_OG <= 1473706) | # 1473696  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 1222822 & RowNum_OG <= 1222842) | # 1222832  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 3046089 & RowNum_OG <= 3046109) # 3046099  --  results from a route change, and a 3hr wait before the new route starts
           )
    )


# examples of TravelTime_Sec values that are the smallest of the large.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 3044689 & RowNum_OG <= 3044709) | # 3044699  --  results from a route change
                (RowNum_OG >= 3022358 & RowNum_OG <= 3022378) | # 3022368  --  results from a route change
                (RowNum_OG >= 2993016 & RowNum_OG <= 2993036) | # 2993026  --  results from a previous route change (change occurred in deleted row)
                (RowNum_OG >= 2683703 & RowNum_OG <= 2683723) # 2683713  --  results from a previous route change (change occurred in deleted row)
           )
    )

```


Let's look at the TravelTime_Sec values and route changes (DirChange2).

The 99th percentile of TravelTime_Sec for both, all trips, and just those trips NOT involving route changes (DirChange2 = "Same"), is approximately 5min (300 sec).

Nota Bene:  The percentile calculation here is defined slightly different than in most of the above analyses (which get the lowest value in the bin created by 100 ntiles).
```{r}

summary(select(NoOutliersSpeed,
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Same"
                     ),
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Change"
                     ),
               TravelTime_Sec
              )
       )


TravTimeSec_Qtiles_df <- data.frame(PctValue = seq(0, 100, 1),
                                    All = seq(1, 101, 1),
                                    Same = seq(1, 101, 1),
                                    Change = seq(1, 101, 1)
                                   )

TravTimeSec_Qtiles_df[ , 2] <- quantile(select(NoOutliersSpeed,
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 3] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Same"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 4] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Change"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

View(TravTimeSec_Qtiles_df)

```


Limit the dataset now based on TravelTime_Sec.
```{r}

UpperLimitTravTime <- filter(NoOutliersSpeed,
                             TravelTime_Sec <= 491 # min at the 100th percentile
                             )

nrow(NoOutliersSpeed) - nrow(UpperLimitTravTime)

str(UpperLimitTravTime)

summary(UpperLimitTravTime)

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

Differences between Dwell_Time (by WMATA) and Dwell_Time2 (by me) appear to be due to switches in RouteAlt. WMATA calculates Dwell_Time by an unknown process. The WMATA calculation is equal to my calculation, except for the records immedaitely before and after a RouteAlt switch (DirChange2).
```{r}

View(filter(AllDays_NewOrder,
            Dwell_Time != Dwell_Time2
           )
    )


# Examples where the Dwell_Time and Dwell_Time2 are different
View(filter(AllDays_NewOrder,
            ( (RowNum_OG >= 65 & RowNum_OG <= 85) | # 75
                (RowNum_OG >= 162 & RowNum_OG <= 192) | # 172
                (RowNum_OG >= 431952 & RowNum_OG <= 431972) | # 431962
                (RowNum_OG >= 434595 & RowNum_OG <= 434615) # 434605  --  this record is NOT a route switch, but does has a Sequence switch (Me: should there really be a route switch here?)
            )
           )
    )

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

First, create some "rank" stats.
View(DT2_Pctiles): 95% of Dwell_Time2s are <= 23 seconds...but some weird (e.g., nearly 2 hour Dwell_Time2s exist).
```{r}

DwellTime2_Ntile <- as.data.frame(AllDays_NewOrder$Dwell_Time2) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Dwell_Time2, 100),
         MinR = min_rank(AllDays_NewOrder$Dwell_Time2),
         PctR = percent_rank(AllDays_NewOrder$Dwell_Time2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DwellTime2_Ntile)[1] <- "Dwell_Time2"
str(DwellTime2_Ntile)

DwellTime2_Ntile_Rows <- nrow(DwellTime2_Ntile)

View(tail(DwellTime2_Ntile, 500))


DwellTime2_Pctiles <- group_by(DwellTime2_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinDwellAtPctile = min(Dwell_Time2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DwellTime2_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DwellTime2_Pctiles)

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

Histogram of Dwell_Time2.
```{r}

DwellTime2_HistDen <- ggplot(AllDays_NewOrder, aes(x = Dwell_Time2, y = ..density..)) +
  geom_histogram(binwidth = 1, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(1, 25), ylim = c(0, 0.05)
                 ) +
  xlab("Time a Bus Stays at a Stop (sec)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Long a Bus Stays at a Stop"
                          # ,atop(italic("xxxxx"),"")
                         )
                    )
         )

DwellTime2_HistDen

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

Looking at some weirdly long Dwell_Time2 values.
```{r}

View(arrange(AllDays_NewOrder,
             desc(Dwell_Time2)
            )
    )


# examples of extremely large Dwell_Time2s
View(filter(AllDays_NewOrder,
            (RowNum_OG >= 292669 & RowNum_OG <= 292689) | # 292679
                (RowNum_OG >= 531057 & RowNum_OG <= 531077) | # 531067
                (RowNum_OG >= 1388627 & RowNum_OG <= 1388647) | # 1388637
                (RowNum_OG >= 1645711 & RowNum_OG <= 1645731) # 1645721
           )
    )


View(filter(AllDays_NewOrder,
            Dwell_Time2 == 0
           )
    )

```


Investigation of Delta_Time (how early or late the bus is).

View(DT2_Pctiles): 94% of Delta_Time values are between -236 seconds and 1,259 seconds. Roughly 66% of records are within 5 min late and 5 min early...but some weird (e.g., almost 50 minute late or 40 minute early) Delta_Times exist.

Note that Delta_Time is the difference from the scheduled bus arrival. So if two buses are scheduled to arrive at a destination at 10:00pm and 10:20pm, and if the 10:20pm bus has a Delta_Time of 5 minutes, there are 25 minutes between bus arrivals at the stop.

Also note that based on a comment at https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/, the Delta_Time values don't appear to coincide with published bus schedules (e.g., the X2 departing every 8 minutes during peak hours).
```{r}

DeltTime_Ntile <- as.data.frame(AllDays_NewOrder$Delta_Time) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Delta_Time, 100),
         MinR = min_rank(AllDays_NewOrder$Delta_Time),
         PctR = percent_rank(AllDays_NewOrder$Delta_Time),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTime_Ntile)[1] <- "Delta_Time"
str(DeltTime_Ntile)

DeltTime_Ntile_Rows <- nrow(DeltTime_Ntile)

View(tail(DeltTime_Ntile, 500))


DeltTime_Pctiles <- group_by(DeltTime_Ntile,
                             PctR_Round
                            ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTime_Pctiles)
DeltTime_Pctiles

# ~66% of rows are between 5 min late and 5 min early
nrow(filter(AllDays_NewOrder,
            Delta_Time >= -300 &
              Delta_Time <= 300
           )
    ) / nrow(AllDays_NewOrder)


# examples of weird large Delta_Times
View(filter(AllDays_NewOrder,
            Delta_Time < -4202 |
              Delta_Time > 1705
           ) %>% 
       arrange(desc(Delta_Time)
              )
    )

```


Investigation of Delta_Time (how early or late the bus is).

Delta_Time histogram.
```{r}

DeltTime_HistDen <- ggplot(AllDays_NewOrder, aes(x = (Delta_Time / 60),
                                                 y = ..density..
                                                )
                          ) +
  geom_histogram(binwidth = (5/60), fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(-5, 5)) +
  xlab("Bus Lateness (min)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Early/Late a Bus Is",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_HistDen

```


Investigation of Delta_Time (how early or late the bus is).

Delta_Time boxplot.
```{r}

# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(AllDays_NewOrder,
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(Delta_Time / 60, na.rm = TRUE)
                     )

DeltTime_BoxPlot <- ggplot(AllDays_NewOrder,
                           aes(factor(Event_Time_HrGroup),
                               Delta_Time / 60,
                               fill = factor(Event_Time_HrGroup)
                              )
                          ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE) + 
  # coord_cartesian(ylim = c(-300, 1200)) +
  coord_cartesian(ylim = c(-5, 20)) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  xlab("Hour Group") + 
  ylab("Bus Lateness (minutes)") + 
  theme(legend.position="none", axis.text.x = element_text(angle=45)) + 
  #theme(legend.position="right", axis.text.x = element_blank()) + 
  ggtitle(expression(atop("How Early/Late is the Bus (by Hour Group)",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_BoxPlot

```


Investigation of Delta_Time (how early or late the bus is).

Exploring "extreme" Delta_Times.  First let's get some "rank" stats.
```{r}

View(DeltTime_Pctiles)
DeltTime_Pctiles


DeltTimeAbs_Ntile <- as.data.frame(abs(AllDays_NewOrder$Delta_Time)) %>% 
  mutate(Pctile = ntile(abs(AllDays_NewOrder$Delta_Time), 100),
         MinR = min_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR = percent_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTimeAbs_Ntile)[1] <- "Delta_Time_Abs"
str(DeltTimeAbs_Ntile)

DeltTimeAbs_Ntile_Rows <- nrow(DeltTimeAbs_Ntile)

View(tail(DeltTimeAbs_Ntile, 500))


DeltTimeAbs_Pctiles <- group_by(DeltTimeAbs_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time_Abs),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTimeAbs_Pctiles)
DeltTimeAbs_Pctiles

```


Investigation of Delta_Time (how early or late the bus is).

Exploring "extreme" Delta_Times.  Then let's calculate the percentage of buses that are 10 minutes (or more) late/early.
```{r}

HrGroup_DeltaTime_All <- group_by(AllDays_NewOrder,
                                  Event_Time_HrGroup
                                 ) %>% 
  summarise(EventAll_Cnt = n()
           )

str(HrGroup_DeltaTime_All)
View(HrGroup_DeltaTime_All)


HrGroup_DeltaTime_Above10Min <- filter(AllDays_NewOrder,
                                       abs(Delta_Time) >= 600
                                      ) %>% 
  group_by(Event_Time_HrGroup) %>% 
  summarise(EventAbove10_Cnt = n()
           )

str(HrGroup_DeltaTime_Above10Min)
View(HrGroup_DeltaTime_Above10Min)


HrGroup_DeltaTimeCompare <- inner_join(HrGroup_DeltaTime_Above10Min,
                                       HrGroup_DeltaTime_All,
                                       by = c("Event_Time_HrGroup" = "Event_Time_HrGroup")
                                      ) %>% 
  mutate(PctEventsAbove10 = EventAbove10_Cnt / EventAll_Cnt)

View(HrGroup_DeltaTimeCompare)

```


Investigation of Delta_Time (how early or late the bus is).

Quickly plot these "extreme" Delta_Times. 
```{r}

DeltTime_Above10_Cols <- ggplot(HrGroup_DeltaTimeCompare,
                                aes(factor(Event_Time_HrGroup),
                                    PctEventsAbove10
                                   )
                               ) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_text(aes(label = format(round(PctEventsAbove10, digits = 2),
                               nsmall = 2
                              )
               ),
            size = 3,
            nudge_y = (HrGroup_DeltaTimeCompare$PctEventsAbove10 * -0.1)
           ) +
  # coord_cartesian(xlim = c(-5, 5)) +
  xlab("Hour Group") + 
  ylab("Percent of All Bus Arrivals") +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  ggtitle(expression(atop("When is a Bus 10+ Minutes Late/Early"
                          # ,atop(italic("positive values are late arrivals"),
                          #      ""
                          #     )
                         )
                    )
         )

DeltTime_Above10_Cols

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Correlation.
```{r}

DwellTDeltaT_Corr <- as.matrix(cor(x = AllDays_NewOrder$Dwell_Time2,
                                   y = AllDays_NewOrder$Delta_Time,
                                   use = "pairwise"
                                  )
                               )

DwellTDeltaT_Corr

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Next, let's get a sample of data for plotting. Let's do this for the full dataset (AllDays_NewOrder).
```{r}

AllDays_NewOrder_10PctSamp <- sample_frac(AllDays_NewOrder, 0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "AllData")

str(AllDays_NewOrder_10PctSamp)

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Let's also get a sample of data for plotting, but with a datset that removes outliers.
```{r}

View(DeltTime_Pctiles)
View(DwellTime2_Pctiles)

AllDays_NewOrder_NoExtremes_10PctSamp <- filter(AllDays_NewOrder,
                                                between(Delta_Time, -402, 1705) & # removes about 2% of Delta_Time values
                                                  between(Dwell_Time2, 1, 63)  # removes about 2% of Dwell_Time2 values
                                               ) %>% 
  sample_frac(0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "OutliersRemoved")

str(AllDays_NewOrder_NoExtremes_10PctSamp)

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does not remove outliers.
```{r}

DwellTDeltaT_Scatter <- ggplot(AllDays_NewOrder_10PctSamp,
                               aes(Dwell_Time2, Delta_Time)
                              ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "red") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 2200,
           y = 600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(no outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(no outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does remove outliers.
```{r}

DwellTDeltaT_Scatter_NoExtremes <- ggplot(AllDays_NewOrder_NoExtremes_10PctSamp,
                                          aes(Dwell_Time2, Delta_Time)
                                         ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 50,
           y = -475,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(2% of outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(2% of outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_NoExtremes

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from both datasets together.
```{r}

CombinedData <- rbind(AllDays_NewOrder_10PctSamp,
                      AllDays_NewOrder_NoExtremes_10PctSamp
                     )

CombinedData$DataSet <- factor(CombinedData$DataSet)

str(CombinedData)


DwellTDeltaT_Scatter_Combined <- ggplot(CombinedData,
                                        aes(x = Dwell_Time2,
                                            y = Delta_Time,
                                            colour = DataSet
                                           )
                                       ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  coord_cartesian(xlim = c(0, 500), ylim = c(-1000, 2000)
                 ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "AllData"
                           ),
              method = "lm",
              colour = "red"
             ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "OutliersRemoved"
                           ),
              method = "lm",
              colour = "blue"
             ) +
  # facet_wrap( ~ DataSet, ncol = 2) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -800,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  theme(legend.position = "bottom") +
  labs(title = "Lateness vs Time at Stop",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
                          # ,atop(italic("2% of outliers removed"),
                          #       ""
                          #      )
         #                 )
         #            )
         # )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_Combined

```











Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).
